Module:Ancient Greek
From KYNNpedia
Documentation for this module may be created at Module:Ancient Greek/doc
local p = {}

-- Combining diacritics, built from their code points. Text is converted to
-- NFD before processing, so diacritics always appear as separate characters.
local macron = mw.ustring.char(0x304)
local breve = mw.ustring.char(0x306)
local rough = mw.ustring.char(0x314)
local smooth = mw.ustring.char(0x313)
local diaeresis = mw.ustring.char(0x308)
local acute = mw.ustring.char(0x301)
local grave = mw.ustring.char(0x300)
local circumflex = mw.ustring.char(0x342)
local Latin_circumflex = mw.ustring.char(0x302)
local subscript = mw.ustring.char(0x345)

-- ustring pattern: a macron, an optional diaeresis, then a Latin circumflex.
local macron_circumflex = macron .. diaeresis .. '?' .. Latin_circumflex

-- Letters before which a gamma is transliterated as <n>.
local is_velar = {
    ['κ'] = true,
    ['γ'] = true,
    ['χ'] = true,
    ['ξ'] = true,
}

-- Byte pattern matching one UTF-8 encoded character.
local UTF8_char = "[%z\1-\127\194-\244][\128-\191]*"
-- Byte pattern matching a two-byte character with lead byte 206 or 207,
-- excluding first line of Greek and Coptic block: ͰͱͲͳʹ͵Ͷͷͺͻͼͽ;Ϳ
local basic_Greek = "[\206-\207][\128-\191]"

-- Maps a character to a table describing its role (see `add_info`).
local info = {}

-- The tables are shared among different characters so that they can be checked
-- for equality if needed, and to use less space.
local vowel = { vowel = true, diacritic_seat = true }
local iota = { vowel = true, diacritic_seat = true, offglide = true }
local upsilon = { vowel = true, diacritic_seat = true, offglide = true }
-- Technically rho is only a seat for rough or smooth breathing.
local rho = { consonant = true, diacritic_seat = true }
local consonant = { consonant = true }
local diacritic = { diacritic = true }
-- Needed for equality comparisons.
local breathing = { diacritic = true }

-- Registers the description table `t` for every character in `characters`.
-- `characters` is either a string (split into UTF-8 characters) or a
-- sequence of single-character strings.
local function add_info(characters, t)
    if type(characters) == "string" then
        for character in string.gmatch(characters, UTF8_char) do
            info[character] = t
        end
    else
        for _, character in ipairs(characters) do
            info[character] = t
        end
    end
end

add_info({
    macron, breve,
    diaeresis,
    acute, grave, circumflex,
    subscript,
}, diacritic)
add_info({rough, smooth}, breathing)
add_info("ΑΕΗΟΩαεηοω", vowel)
add_info("Ιι", iota)
add_info("Υυ", upsilon)
add_info("ΒΓΔΖΘΚΛΜΝΞΠΡΣΤΦΧΨϜϘϺϷͶϠβγδζθκλμνξπρσςτφχψϝϙϻϸͷϡ", consonant)
-- Must come after the consonant registration so that rho's entry wins.
add_info("Ρρ", rho)

-- Any character without an entry yields this empty table, so that fields of
-- `info[character]` can be read without a nil check.
local not_recognized = {}
setmetatable(info, { __index = function() return not_recognized end })

-- Wraps `str` in curly double quotation marks for use in error messages.
local function quote(str)
    return "“" .. str .. "”"
end

-- Transliterations shared by both systems. System-specific entries are
-- reached through the `__index` metamethod installed by `p.transliterate`.
local correspondences = {
    -- Vowels
    ["α"] = "a",
    ["ε"] = "e",
    ["η"] = "e" .. macron,
    ["ι"] = "i",
    ["ο"] = "o",
    ["υ"] = "u",
    ["ω"] = "o" .. macron,

    -- Consonants
    ["β"] = "b",
    ["γ"] = "g",
    ["δ"] = "d",
    ["ζ"] = "z",
    ["θ"] = "th",
    ["κ"] = "k",
    ["λ"] = "l",
    ["μ"] = "m",
    ["ν"] = "n",
    ["ξ"] = "x",
    ["π"] = "p",
    ["ρ"] = "r",
    ["σ"] = "s",
    ["ς"] = "s",
    ["τ"] = "t",
    ["φ"] = "ph",
    ["ψ"] = "ps",

    -- Archaic letters
    ["ϝ"] = "w",
    ["ϻ"] = "ś",
    ["ϙ"] = "q",
    ["ϡ"] = "š",
    ["ͷ"] = "v",

    -- Diacritics
    [smooth] = '',
    [rough] = '', -- h is added below in the `transliterate` function.
    [breve] = '',
}

-- Entries used only by the ALA-LC system; most diacritics are dropped.
local ALA_LC = {
    ["χ"] = "ch",
    [acute] = '',
    [grave] = '',
    [circumflex] = '',
    [subscript] = '',
    [diaeresis] = '',
    [macron] = '',
}

-- Entries used only by the Wiktionary system.
local Wiktionary_transliteration = {
    ["χ"] = "kh",
    [circumflex] = Latin_circumflex,
    [subscript] = 'i',
}

-- Sets `index_metamethod` as the `__index` of `t`, creating the metatable
-- if `t` does not have one yet.
local function add_index_metamethod(t, index_metamethod)
    local mt = getmetatable(t)
    if not mt then
        mt = {}
        setmetatable(t, mt)
    end
    mt.__index = index_metamethod
end

--[=[
    This breaks a word into meaningful "tokens", which are
    individual letters or diphthongs with their diacritics.
    Used by [[Module:grc-accent]] and [[Module:grc-pronunciation]].

    Returns a sequence of token strings. Raises an error when a diacritic
    follows a character that cannot carry it.
--]=]
local function tokenize(text)
    local tokens, prev_info = {}, {}
    local token_i = 1
    local prev
    for character in string.gmatch(mw.ustring.toNFD(text), UTF8_char) do
        local curr_info = info[character]
        -- Split vowels between tokens if not a diphthong.
        if curr_info.vowel then
            if prev and (not (curr_info.offglide and prev_info.vowel)
                    -- υυ → υ, υ
                    -- ιυ → ι, υ
                    or prev_info.offglide and curr_info == upsilon) then
                token_i = token_i + 1
            end
            tokens[token_i] = (tokens[token_i] or "") .. character
        elseif curr_info.diacritic then
            tokens[token_i] = (tokens[token_i] or "") .. character
            if prev_info.vowel or prev_info.diacritic then
                if character == diaeresis then
                    -- Current token is vowel, vowel, possibly other diacritics,
                    -- and a diaeresis.
                    -- Split the current token into two:
                    -- the first letter, then the second letter plus any diacritics.
                    local previous_vowel, vowel_with_diaeresis = string.match(
                        tokens[token_i],
                        "^(" .. basic_Greek .. ")(" .. basic_Greek .. ".+)")
                    if previous_vowel then
                        tokens[token_i], tokens[token_i + 1] = previous_vowel, vowel_with_diaeresis
                        token_i = token_i + 1
                    end
                end
            elseif prev_info == rho then
                if curr_info ~= breathing then
                    -- Raised rather than returned: the caller iterates the
                    -- result as a table and cannot handle a string.
                    error(string.format("The character %s cannot have the accent %s on it.",
                        prev, "◌" .. character))
                end
            elseif prev == nil then
                -- The text starts with a diacritic, so there is no
                -- character to name in the message.
                error("The diacritic " .. quote("◌" .. character)
                    .. " has no preceding character to be placed on.")
            else
                error("The character " .. quote(prev) .. " cannot have a diacritic on it.")
            end
        elseif curr_info == rho then
            -- Keep a rho in the current token only when that token is a
            -- rho followed directly by a breathing.
            if prev and not (prev_info == breathing
                    and info[string.match(tokens[token_i], "^" .. basic_Greek)] == rho) then
                token_i = token_i + 1
            end
            tokens[token_i] = (tokens[token_i] or "") .. character
        else
            -- Any other character starts a token of its own.
            if prev then
                token_i = token_i + 1
            end
            tokens[token_i] = (tokens[token_i] or "") .. character
        end
        prev = character
        prev_info = curr_info
    end
    return tokens
end

--[=[
    Transliterates Ancient Greek `text` into Latin script.

    `system` selects the scheme: "ALA-LC" uses the ALA_LC entries, any other
    value (including nil) uses the Wiktionary entries.
    Returns the transliteration as a string. Errors raised by `tokenize`
    propagate to the caller.

    NOTE(review): the macron for ᾳ is only added when `system` is exactly
    "Wiktionary", so a nil `system` (as passed by `p.bare_translit`) uses the
    Wiktionary letter mapping without that step. Confirm whether this is
    intended.
--]=]
function p.transliterate(text, system)
    add_index_metamethod(correspondences,
        system == "ALA-LC" and ALA_LC or Wiktionary_transliteration)

    if text == '῾' then
        return 'h'
    end

    text = mw.ustring.toNFD(text)

    --[[
        Replace semicolon or Greek question mark with regular question mark,
        except after an ASCII alphanumeric character (to avoid converting
        semicolons in HTML entities).
    --]]
    text = mw.ustring.gsub(text, "([^A-Za-z0-9])[;" .. mw.ustring.char(0x37E) .. "]", "%1?")

    -- Handle the middle dot. It is equivalent to semicolon or colon, but semicolon is probably more common.
    text = text:gsub("·", ";")

    local tokens = tokenize(text)

    -- Now read the tokens. `ipairs` is required here: output order must
    -- follow token order, and `pairs` does not guarantee any order.
    local output = {}
    for i, token in ipairs(tokens) do
        -- substitute each character in the token for its transliteration
        local translit = string.gsub(mw.ustring.lower(token), UTF8_char, correspondences)

        if token == 'γ' and is_velar[tokens[i + 1]] then
            -- γ before a velar should be <n>
            translit = 'n'
        elseif token == 'ρ' and tokens[i - 1] == 'ρ' then
            -- ρ after ρ should be <rh>
            translit = 'rh'
        elseif system == "Wiktionary" and mw.ustring.find(token, '^[αΑ].*' .. subscript .. '$') then
            -- add macron to ᾳ
            translit = mw.ustring.gsub(translit, '([aA])', '%1' .. macron)
        end

        if token:find(rough) then
            if mw.ustring.find(token, '[Ρρ]') then
                translit = translit .. 'h'
            else -- vowel
                translit = 'h' .. translit
            end
        end

        if system == "ALA-LC" and mw.ustring.find(token, '^[υΥ][^ιΙ]*$') then
            translit = translit:gsub('u', 'y'):gsub('U', 'Y')
        end

        -- Remove macron from a vowel that has a circumflex.
        if mw.ustring.find(translit, macron_circumflex) then
            translit = translit:gsub(macron, '')
        end

        -- Capitalize first character of transliteration.
        if token ~= mw.ustring.lower(token) then
            translit = mw.ustring.gsub(translit, "^.", mw.ustring.upper)
        end

        output[i] = translit
    end

    return table.concat(output)
end

-- Template entry point. Reads the text from the first positional argument of
-- the frame (or of its parent frame) and the scheme from |system=, which
-- defaults to "Wiktionary" when absent or empty. Raises an error for any
-- other unrecognized value. Returns the transliteration in an italic
-- <span> tagged lang="grc-Latn".
function p.translit(frame)
    local text = frame.args[1] or frame:getParent().args[1]

    local system = frame.args.system
    if system == nil or system == "" then
        system = "Wiktionary"
    elseif not (system == "ALA-LC" or system == "Wiktionary") then
        error('Transliteration system in |system= not recognized; choose between "ALA-LC" and "Wiktionary"')
    end

    local transliteration = p.transliterate(text, system)

    return '<span title="Ancient Greek transliteration" lang="grc-Latn"><i>' .. transliteration .. '</i></span>'
end

-- Template entry point returning the transliteration without any markup.
-- No system is passed on to `p.transliterate`.
function p.bare_translit(frame)
    return p.transliterate(frame.args[1] or frame:getParent().args[1])
end

return p