-- Author: Benwing, based on early version by CodeCat.
--[[
FIXME: Nouns/adjectives to create to exemplify complex declensions:
-- riḍan (رِضًا or رِضًى)
--]]
local m_utilities = require("Module:utilities")
local m_links = require("Module:links")
local ar_utilities = require("Module:ar-utilities")
local lang = require("Module:languages").getByCode("ar")
local u = mw.ustring.char
local rfind = mw.ustring.find
local rsubn = mw.ustring.gsub
local rmatch = mw.ustring.match
local rsplit = mw.text.split
-- This is used in place of a transliteration when no manual
-- translit is specified and we're unable to automatically generate
-- one (typically because some vowel diacritics are missing).
local BOGUS_CHAR = u(0xFFFD) -- U+FFFD REPLACEMENT CHARACTER
-- hamza variants
local HAMZA = u(0x0621) -- hamza on the line (stand-alone hamza) = ء
local HAMZA_ON_ALIF = u(0x0623) -- U+0623, hamza above ʾalif
local HAMZA_ON_W = u(0x0624) -- U+0624, hamza above wāw
local HAMZA_UNDER_ALIF = u(0x0625) -- U+0625, hamza below ʾalif
local HAMZA_ON_Y = u(0x0626) -- U+0626, hamza above yāʾ
-- Pattern character class matching any one of the five hamza variants above.
local HAMZA_ANY = "[" .. HAMZA .. HAMZA_ON_ALIF .. HAMZA_UNDER_ALIF .. HAMZA_ON_W .. HAMZA_ON_Y .. "]"
local HAMZA_PH = u(0xFFF0) -- hamza placeholder
-- various letters
local ALIF = u(0x0627) -- ʾalif = ا
local AMAQ = u(0x0649) -- ʾalif maqṣūra = ى
local AMAD = u(0x0622) -- ʾalif madda = آ
local TAM = u(0x0629) -- tāʾ marbūṭa = ة
local T = u(0x062A) -- tāʾ = ت
local HYPHEN = u(0x0640) -- U+0640 ARABIC TATWEEL (kashida); named HYPHEN in this module
local N = u(0x0646) -- nūn = ن
local W = u(0x0648) -- wāw = و
local Y = u(0x064A) -- yā = ي
-- diacritics
local A = u(0x064E) -- fatḥa
local AN = u(0x064B) -- fatḥatān (fatḥa tanwīn)
local U = u(0x064F) -- ḍamma
local UN = u(0x064C) -- ḍammatān (ḍamma tanwīn)
local I = u(0x0650) -- kasra
local IN = u(0x064D) -- kasratān (kasra tanwīn)
local SK = u(0x0652) -- sukūn = no vowel
local SH = u(0x0651) -- šadda = gemination of consonants
local DAGGER_ALIF = u(0x0670) -- U+0670, superscript (dagger) ʾalif
-- Pattern character class matching any one diacritic defined above except šadda.
local DIACRITIC_ANY_BUT_SH = "[" .. A .. I .. U .. AN .. IN .. UN .. SK .. DAGGER_ALIF .. "]"
-- common combinations
-- Each name spells out the sequence it holds: a doubled vowel letter means the
-- short-vowel diacritic followed by the matching long-vowel letter (e.g. AA is
-- fatḥa + ʾalif), and a trailing SK means a sukūn is appended.
local NA = N .. A
local NI = N .. I
local AH = A .. TAM
local AT = A .. T
local AA = A .. ALIF
local AAMAQ = A .. AMAQ
local AAH = AA .. TAM
local AAT = AA .. T
local II = I .. Y
local IIN = II .. N
local IINA = II .. NA
local IY = II -- same string as II (kasra + yāʾ), under a second name
local UU = U .. W
local UUN = UU .. N
local UUNA = UU .. NA
local AY = A .. Y
local AW = A .. W
local AYSK = AY .. SK
local AWSK = AW .. SK
local AAN = AA .. N
local AANI = AA .. NI
local AYN = AYSK .. N
local AYNI = AYSK .. NI
local AWN = AWSK .. N
local AWNA = AWSK .. NA
local AYNA = AYSK .. NA
local AYAAT = AY .. AAT
-- Pattern character class matching either ḍammatān or ḍamma.
local UNU = "[" .. UN .. U .. "]"
-- optional diacritics/letters
-- These are pattern fragments: the "?" makes the preceding diacritic (or
-- character class) optional when the fragment is embedded in a pattern.
local AOPT = A .. "?"
local AOPTA = A .. "?" .. ALIF -- optional fatḥa followed by a mandatory ʾalif
local IOPT = I .. "?"
local UOPT = U .. "?"
local UNOPT = UN .. "?"
local UNUOPT = UNU .. "?"
local SKOPT = SK .. "?"
-- lists of consonants
-- exclude tāʾ marbūṭa because we don't want it treated as a consonant
-- in patterns like أَفْعَل
local consonants_needing_vowels_no_tam = "بتثجحخدذرزسشصضطظعغفقكلمنهپچڤگڨڧأإؤئء"
-- consonants on the right side; includes alif madda
local rconsonants_no_tam = consonants_needing_vowels_no_tam .. "ويآ"
-- consonants on the left side; does not include alif madda
local lconsonants_no_tam = consonants_needing_vowels_no_tam .. "وي"
-- Character class matching one left-side consonant, without and with a capture.
local CONS = "[" .. lconsonants_no_tam .. "]"
local CONSPAR = "([" .. lconsonants_no_tam .. "])"
local LRM = u(0x200E) --left-to-right mark
-- First syllable or so of elative/color-defect adjective
-- (anchored pattern: hamza-on-ʾalif, optional fatḥa, then one captured consonant)
local ELCD_START = "^" .. HAMZA_ON_ALIF .. AOPT .. CONSPAR
-- Table of functions exported by this module.
local export = {}
--------------------
-- Utility functions
--------------------
-- "If Not Empty": normalize a raw template argument.
--   * nil and the empty string both become nil;
--   * a value wrapped in a pair of double or single quotes has the quotes
--     stripped (so '""' and "''" yield a genuine empty string);
--   * anything else is returned unchanged.
function ine(x)
	if x == nil or x == "" then
		return nil
	end
	-- Try double quotes first, then single quotes; the capture is the
	-- text between the outermost pair of quotes (possibly empty).
	local unquoted = rmatch(x, '^"(.*)"$')
	if unquoted then
		return unquoted
	end
	unquoted = rmatch(x, "^'(.*)'$")
	if unquoted then
		return unquoted
	end
	return x
end
-- Structural equality test. Two tables are equal when their array parts
-- have the same length and are pairwise equal (recursively); any other
-- pair of values is compared with ==.
-- FIXME: non-array (hash) keys are ignored, so this is only reliable for
-- array-like tables.
function equals(x, y)
	if type(x) ~= "table" or type(y) ~= "table" then
		return x == y
	end
	if #x ~= #y then
		return false
	end
	for i, xitem in ipairs(x) do
		if not equals(xitem, y[i]) then
			return false
		end
	end
	return true
end
-- Return true if some value stored in TAB is equal to ITEM, as judged by
-- equals() (so nested arrays are compared structurally); false otherwise.
function contains(tab, item)
	local found = false
	for _, candidate in pairs(tab) do
		if equals(candidate, item) then
			found = true
			break
		end
	end
	return found
end
-- Append ITEM to the array TAB unless an equal element (per contains())
-- is already stored there.
function insert_if_not(tab, item)
	if contains(tab, item) then
		return
	end
	table.insert(tab, item)
end
-- Like rsubn(), but return only the substituted string and drop the
-- substitution count.
function rsub(term, foo, bar)
	-- The extra parentheses truncate the call to its first return value.
	return (rsubn(term, foo, bar))
end
-- Like rsub(), but raise an assertion failure if the pattern FOO did not
-- match anywhere in TERM (i.e. no substitution was made).
function assert_rsub(term, foo, bar)
	local substituted, nsubs = rsubn(term, foo, bar)
	assert(nsubs > 0)
	return substituted
end
-- Format ARABIC through Module:links' full_link(), passing it as the
-- display ("alt") text for this module's language object, with the
-- second argument "term".
function make_link(arabic)
	local linkdata = {lang = lang, alt = arabic}
	return m_links.full_link(linkdata, "term")
end
-- Record a tracking entry named "ar-nominals/PAGE" via Module:debug.
-- Always returns true.
function track(page)
	local debug_module = require("Module:debug")
	debug_module.track("ar-nominals/" .. page)
	return true
end
-------------------------------------
-- Functions for building inflections
-------------------------------------
-- Functions that do the actual inflecting by creating the forms of a basic term.
-- Keyed by declension-type name; call_inflection() looks up inflections[ty]
-- and calls it as inflections[ty](ar, tr, data, mod, numgen).
local inflections = {}
local max_mods = 9 -- maximum number of modifiers
-- Argument-name prefixes of the modifiers: the first modifier is plain
-- "mod", the remaining ones carry their index ("mod2" .. "mod9").
local mod_list = {"mod"} -- list of "mod", "mod2", "mod3", ...
for i=2,max_mods do
table.insert(mod_list, "mod" .. i)
end
-- Create and return a fresh 'data' structure. It holds all of the generated
-- declensional forms plus ancillary information: the lists of all possible
-- genders, states, numbers and cases, and the numbers and states that will
-- actually be stored ('data.numbers' and 'data.states', filled in later).
--
-- 'forms' maps each inflectional category (e.g. "nom_sg_ind" for nouns or
-- "nom_m_sg_ind" for adjectives) to an array of alternatives (e.g. different
-- plurals). Each alternative is either a string of the form "ARABIC" or
-- "ARABIC/TRANSLIT", or an array of such strings; the latter is used for
-- alternative spellings involving different hamza seats,
-- e.g. مُبْتَدَؤُون or مُبْتَدَأُون. Alternative hamza spellings are separated in
-- display by an "inner separator" (/), while alternatives on the level of
-- different plurals are separated by an "outer separator" (;).
function init_data()
	-- Index into an endings array giving the correct ending for each
	-- combination of state and case.
	local statecases = {
		ind = {nom = 1, acc = 2, gen = 3, inf = 10, lemma = 13},
		def = {nom = 4, acc = 5, gen = 6, inf = 11, lemma = 14},
		-- used for a definite adjective modifying a construct-state noun
		defcon = {nom = 4, acc = 5, gen = 6, inf = 11, lemma = 14},
		con = {nom = 7, acc = 8, gen = 9, inf = 12, lemma = 15},
	}
	-- Display names of the numbers, lower-case and capitalized.
	local engnumbers = {
		sg = "số ít", du = "số kép", pl = "số nhiều",
		coll = "danh từ tập hợp", sing = "danh từ số đơn lẻ",
		pauc = "danh từ số vài",
	}
	local engnumberscap = {
		sg = "Số ít", du = "Số đôi", pl = "Số nhiều",
		coll = "Danh từ tập hợp", sing = "Số đơn lẻ",
		pauc = "Số vài (3-10)",
	}
	local data = {
		forms = {},
		-- 'title' starts out unset (nil)
		categories = {},
		allgenders = {"m", "f"},
		allstates = {"ind", "def", "con"},
		allnumbers = {"sg", "du", "pl"},
		states = {}, -- initialized later
		numbers = {}, -- initialized later
		engnumbers = engnumbers,
		engnumberscap = engnumberscap,
		allcases = {"nom", "acc", "gen", "inf"},
		allcases_with_lemma = {"nom", "acc", "gen", "inf", "lemma"},
		statecases = statecases,
	}
	return data
end
-- Initialize and return ARGS, ORIGARGS and DATA (see init_data()).
-- ARGS is a table of user-supplied arguments derived from ORIGARGS by
-- (1) passing every value through ine(), so that empty arguments become nil
-- and "" or '' arguments become empty strings, and (2) appending each
-- translit argument to its base argument with a separating slash, which is
-- the format expected in the rest of the code. ORIGARGS is returned as is.
--
-- FIXME: We should consider separating translit and base arguments by the
-- separators ; , | (used in overrides; see handle_lemma_and_overrides())
-- and matching up individual parts, to allow separate translit arguments
-- to be specified for overrides. But maybe not; the point of allowing
-- separate translit arguments is for compatibility with headword
-- templates such as "ar-noun" and "ar-adj", and those templates don't
-- handle override arguments.
function init(origargs)
	local args = {}
	for name, value in pairs(origargs) do
		args[name] = ine(value)
	end

	-- Append the value of translit argument TRNAME to base argument BASE;
	-- it is an error for the base argument to be missing.
	local function append_translit(base, trname)
		local basevalue = args[base]
		if not basevalue then
			error("Argument '" .. trname .."' specified but not corresponding base argument '" .. base .. "'")
		end
		args[base] = basevalue .. "/" .. args[trname]
	end

	-- By convention, corresponding to arg 1 is tr; corresponding to
	-- head2, head3, ... is tr2, tr3, ...; corresponding to
	-- modhead2, modhead3, ... is modtr2, modtr3, ...; corresponding to
	-- modNhead2, modNhead3, ... is modNtr2, modNtr3, ..; corresponding to
	-- all other arguments FOO, FOO2, ... is FOOtr, FOO2tr, ...
	-- Only existing keys are reassigned inside this loop, which is safe
	-- during a pairs() traversal.
	for name in pairs(args) do
		local base
		if name == "tr" then
			base = 1
		elseif rfind(name, "tr[0-9]+$") then
			base = assert_rsub(name, "tr([0-9]+)$", "head%1")
		elseif rfind(name, "tr$") then
			base = assert_rsub(name, "tr$", "")
		end
		if base then
			append_translit(base, name)
		end
	end

	return args, origargs, init_data()
end
-- Parse the user-specified state spec and other related arguments. The
-- user can specify, using idafaN=, how modifiers are related to previous
-- words. The user can also manually specify which states are to appear;
-- whether to omit the definite article in the definite state; and
-- how/whether to restrict modifiers to a particular state, case or number.
-- Normally the modN_* parameters and basestate= do not need to be set
-- directly; instead, use idafaN=. It may be necessary to explicitly
-- specify state= in the presence of proper nouns or definite-only
-- adjectival expressions. NOTE: At the time this function is called,
-- data.numbers has not yet been initialized.
--
-- DATA is the structure created by init_data() (data.allnumgens must
-- already have been set by the caller); ARGS is the massaged argument table
-- from init(). Results are stored into DATA (data.states, data.basestate,
-- data.noirreg, data.omitarticle, data.prefix and the per-modifier
-- data.modN_state/_case/_number/_numgen/_idafa/_omitarticle/_prefix
-- fields). Nothing is returned; invalid argument values raise an error.
function parse_state_etc_spec(data, args)
	-- If ARG was given, verify that its value is one of ALLVALUES and
	-- store it in data[DATAVAL].
	local function check(arg, dataval, allvalues)
		if args[arg] then
			if not contains(allvalues, args[arg]) then
				error("For " .. arg .. "=, value '" .. args[arg] .. "' should be one of " ..
					table.concat(allvalues, ", "))
			end
			data[dataval] = args[arg]
		end
	end

	-- Like check(), for yes/no arguments; the stored value is converted
	-- to a real boolean.
	local function check_boolean(arg, dataval)
		check(arg, dataval, {"yes", "no"})
		if data[dataval] == "yes" then
			data[dataval] = true
		elseif data[dataval] == "no" then
			data[dataval] = false
		end
	end

	-- Make sure no holes in mod values
	for i=1,(#mod_list)-1 do
		if args[mod_list[i+1]] and not args[mod_list[i]] then
			error("Hole in modifier arguments -- " .. mod_list[i+1] ..
				" present but not " .. mod_list[i])
		end
	end

	-- FIXME! Remove this once we're sure there are no instances of mod2
	-- that haven't been converted to modhead2.
	if args["mod2"] then
		track("mod2")
	end

	-- Set default value; may be overridden e.g. by arg["state"] or
	-- by idafaN=.
	data.states = data.allstates

	-- List of pairs of idafaN/modN parameters
	local idafa_mod_list = {{"idafa", "mod"}}
	for i=2,max_mods do
		table.insert(idafa_mod_list, {"idafa" .. i, "mod" .. i})
	end

	-- True if the value of an |idafa= param is a valid adjectival modifier
	-- value.
	local function valid_adjectival_idafaval(idafaval)
		return idafaval == "adj" or idafaval == "adj-base" or
			idafaval == "adj-mod" or rfind(idafaval, "^adj%-mod[0-9]+$")
	end

	-- Extract the referent (base or modifier) of an adjectival |idafa= param.
	-- Assumes the value is valid.
	local function adjectival_idafaval_referent(idafaval)
		if idafaval == "adj" then
			return "base"
		end
		return assert_rsub(idafaval, "^adj%-", "")
	end

	-- Convert a base/mod spec to an index: 0=base, 1=mod, 2=mod2, etc.
	local function basemod_to_index(basemod)
		if basemod == "base" then return 0 end
		if basemod == "mod" then return 1 end
		return tonumber(assert_rsub(basemod, "^mod", ""))
	end

	-- Recognize idafa spec and handle it.
	-- We do the following:
	-- (1) Check that if idafaN= is given, then modN= is also given.
	-- (2) Check that adjectival modifiers aren't followed by idafa modifiers.
	-- (3) Check that adjectival modifiers are modifying the base or an
	--     ʾidāfa modifier, not another adjectival modifier.
	-- (4) Support idafa values "adj-base", "adj-mod", "adj-mod2", "adj"
	--     (="adj-base") etc. and check that we're referring to an earlier
	--     word.
	-- (5) For ʾidāfa modifiers, set basestate=con, set modN_case=gen,
	--     set modN_idafa=true, and set modN_number to the number specified
	--     in the parameter value (e.g. 'sg' or 'def-pl'); and if the
	--     parameter value specifies a state (e.g. 'def' or 'ind-du'),
	--     set modN_state= to this value, and if this is the last ʾidāfa
	--     modifier, also set state= to this value; if this is not the last
	--     ʾidāfa modifier, set modN_state=con and disallow a state to be
	--     specified in the parameter value.
	-- (6) For adjectival modifiers of the base, do nothing.
	-- (7) For adjectival modifiers of ʾidāfa modifiers, set modN_case=gen;
	--     set modN_idafa=false; and set modN_number=, modN_numgen= and
	--     modN_state= to match the values of the idafa modifier.

	-- error checking and find last ʾidāfa modifier
	local last_is_idafa = true
	local last_idafa_mod = "base"
	for _, idafa_mod in ipairs(idafa_mod_list) do
		local idafaparam = idafa_mod[1]
		local mod = idafa_mod[2]
		local idafaval = args[idafaparam]
		if idafaval then
			local paramval = idafaparam .. "=" .. idafaval
			if not args[mod] then
				error("'" .. idafaparam .. "' parameter without corresponding '"
					.. mod .. "' parameter")
			end
			if not valid_adjectival_idafaval(idafaval) then
				-- We're a construct (ʾidāfa) modifier
				if not last_is_idafa then
					error("ʾidāfa modifier " .. paramval .. " follows adjectival modifier")
				end
				last_idafa_mod = mod
			else
				last_is_idafa = false
				local adjref = adjectival_idafaval_referent(idafaval)
				if adjref ~= "base" then
					if basemod_to_index(adjref) >= basemod_to_index(mod) then
						error(paramval .. " can only refer to an earlier element")
					end
					local idafaref = assert_rsub(adjref, "^mod", "idafa")
					if not args[idafaref] then
						error(paramval .. " cannot refer to a missing modifier")
					elseif valid_adjectival_idafaval(args[idafaref]) then
						error(paramval .. " cannot refer to an adjectival modifier")
					end
				end
			end
		end
	end

	-- Now go through and set all the modN_ data values appropriately.
	for _, idafa_mod in ipairs(idafa_mod_list) do
		local idafaparam = idafa_mod[1]
		local mod = idafa_mod[2]
		local idafaval = args[idafaparam]
		if idafaval then
			local paramval = idafaparam .. "=" .. idafaval
			-- Stays true unless one of the branches below recognizes the value.
			local bad_idafa = true
			-- Normalize shorthand values: "yes" means "sg", and a bare state
			-- means that state in the singular.
			if idafaval == "yes" then
				idafaval = "sg"
			end
			if idafaval == "ind-def" or contains(data.allstates, idafaval) then
				idafaval = idafaval .. "-sg"
			end
			if valid_adjectival_idafaval(idafaval) then
				local adjref = adjectival_idafaval_referent(idafaval)
				if adjref ~= "base" then
					data[mod .. "_case"] = "gen"
					data[mod .. "_state"] = data[adjref .. "_state"]
					-- if agreement is with ind-def, make it def
					if data[mod .. "_state"] == "ind-def" then
						data[mod .. "_state"] = "def"
					end
					data[mod .. "_number"] = data[adjref .. "_number"]
					data[mod .. "_numgen"] = data[adjref .. "_numgen"]
					data[mod .. "_idafa"] = false
				end
				bad_idafa = false
			elseif contains(data.allnumbers, idafaval) then
				data.basestate = "con"
				data[mod .. "_case"] = "gen"
				data[mod .. "_number"] = idafaval
				data[mod .. "_idafa"] = true
				if mod ~= last_idafa_mod then
					data[mod .. "_state"] = "con"
				end
				bad_idafa = false
			elseif rfind(idafaval, "%-") then
				local state_num = rsplit(idafaval, "%-")
				-- Support ind-def as a possible value. We set modstate to
				-- ind-def, which will signal definite agreement with adjectival
				-- modifiers; then later on we change the value to ind.
				if #state_num == 3 and state_num[1] == "ind" and state_num[2] == "def" then
					state_num[1] = "ind-def"
					state_num[2] = state_num[3]
					table.remove(state_num)
				end
				if #state_num == 2 then
					local state = state_num[1]
					local num = state_num[2]
					if (state == "ind-def" or contains(data.allstates, state))
						and contains(data.allnumbers, num) then
						if mod == last_idafa_mod then
							if state == "ind-def" then
								data.states = {"def"}
							else
								data.states = {state}
							end
						else
							error(paramval .. " cannot specify a state because it is not the last ʾidāfa modifier")
						end
						data.basestate = "con"
						data[mod .. "_case"] = "gen"
						data[mod .. "_state"] = state
						data[mod .. "_number"] = num
						data[mod .. "_idafa"] = true
						bad_idafa = false
					end
				end
			end
			if bad_idafa then
				error(paramval .. " should be one of yes, def, sg, def-sg, adj, adj-base, adj-mod, adj-mod2 or similar")
			end
		end
	end

	if args["state"] == "ind-def" then
		data.states = {"def"}
		data.basestate = "ind"
	elseif args["state"] then
		data.states = rsplit(args["state"], ",")
		for _, state in ipairs(data.states) do
			if not contains(data.allstates, state) then
				error("For state=, value '" .. state .. "' should be one of " ..
					table.concat(data.allstates, ", "))
			end
		end
	end

	-- Now process explicit settings, so that they can override the
	-- settings based on idafaN=.
	check("basestate", "basestate", data.allstates)
	check_boolean("noirreg", "noirreg")
	check_boolean("omitarticle", "omitarticle")
	data.prefix = args.prefix
	for _, mod in ipairs(mod_list) do
		check(mod .. "state", mod .. "_state", data.allstates)
		check(mod .. "case", mod .. "_case", data.allcases)
		-- modN_number holds a plain number ("sg", "du", "pl"), exactly as
		-- set from idafaN= above, so validate it against the list of
		-- numbers rather than the list of number/gender combinations.
		check(mod .. "number", mod .. "_number", data.allnumbers)
		check(mod .. "numgen", mod .. "_numgen", data.allnumgens)
		check_boolean(mod .. "idafa", mod .. "_idafa")
		check_boolean(mod .. "omitarticle", mod .. "_omitarticle")
		data[mod .. "_prefix"] = args[mod .. "prefix"]
	end

	-- Make sure modN_numgen is initialized, to modN_number if necessary.
	-- This simplifies logic in certain places, e.g. call_inflections().
	-- Also convert ind-def to ind.
	for _, mod in ipairs(mod_list) do
		data[mod .. "_numgen"] = data[mod .. "_numgen"] or data[mod .. "_number"]
		if data[mod .. "_state"] == "ind-def" then
			data[mod.. "_state"] = "ind"
		end
	end
end
-- Parse the user-specified number spec (number=), a comma-separated list of
-- the numbers that are to appear. Each value must be one of data.allnumbers,
-- else an error is raised. Stores the result in data.numbers (defaulting to
-- data.allnumbers when number= is absent) and returns true if number= was
-- specified, false otherwise.
function parse_number_spec(data, args)
	local spec = args["number"]
	if not spec then
		data.numbers = data.allnumbers
		return false
	end
	data.numbers = rsplit(spec, ",")
	for _, num in ipairs(data.numbers) do
		if not contains(data.allnumbers, num) then
			error("For number=, value '" .. num .. "' should be one of " ..
				table.concat(data.allnumbers, ", "))
		end
	end
	return true
end
-- Determine which numbers will appear, using the logic for nouns, and store
-- them in data.numbers. PLS is the stem specification of the plural.
--
-- If number= is given, exactly those numbers appear. Otherwise, if any
-- plurals are given, duals and plurals appear; else only the singular (on
-- the assumption that the word is a proper noun or abstract noun that exists
-- only in the singular). The singular is omitted if "-" is given for it, and
-- similarly for the dual.
function determine_noun_numbers(data, args, pls)
	if parse_number_spec(data, args) then
		return
	end
	local numbers = {}
	data.numbers = numbers
	local have_sg = args[1] ~= "-"
	local duarg = args["d"]
	local explicit_du = duarg and duarg ~= "-"
	if have_sg then
		table.insert(numbers, "sg")
	end
	if #pls["base"] > 0 then
		-- Dual appears if either: explicit dual stem (not -) is given, or
		-- default dual is used and explicit singular stem (not -) is given.
		if explicit_du or (not duarg and have_sg) then
			table.insert(numbers, "du")
		end
		table.insert(numbers, "pl")
	elseif explicit_du then
		-- Explicit dual but no plural given: include the dual. Useful for
		-- dual tantum words.
		table.insert(numbers, "du")
	end
end
-- Convert STEM to stem-and-type format once for every singular in SGS and
-- add each resulting {COMBINED_STEM, TYPE} pair to RESULTS unless an equal
-- pair is already there. A STEM of "-" adds nothing. SGS is the list of
-- singular items that derived forms are based on (masculine or feminine as
-- appropriate), an array of {COMBINED_STEM, TYPE} pairs as returned by
-- stem_and_type(). ISFEM is true for feminine gender; NUM is "sg", "du" or
-- "pl"; POS is the part of speech, generally "noun" or "adjective".
function insert_stems(stem, results, sgs, isfem, num, pos)
	if stem ~= "-" then
		for _, sgpair in ipairs(sgs) do
			local sgstem, sgtype = sgpair[1], sgpair[2]
			local combined_stem, ty = export.stem_and_type(stem,
				sgstem, sgtype, isfem, num, pos)
			insert_if_not(results, {combined_stem, ty})
		end
	end
end
-- Apply manually specified overrides of individual forms, then determine the
-- lemma (for the base and for every modifier) and allow the lemma itself to
-- be overridden.
--
-- In an override value, outer-level alternants are separated by ; or , or
-- the Arabic equivalents, and inner-level alternants by | (we can't use /
-- because it's already in use separating Arabic from translit).
function handle_lemma_and_overrides(data, args)
	-- If argument ARG was given, parse it into an array (outer alternants)
	-- of arrays (inner alternants) and store it as data.forms[ARG].
	local function handle_override(arg)
		local value = args[arg]
		if not value then
			return
		end
		local outer = {}
		for _, outer_alt in ipairs(rsplit(value, "[;,؛،]")) do
			local inner = rsplit(outer_alt, "|")
			table.insert(outer, inner)
		end
		data.forms[arg] = outer
	end

	-- Apply the per-form overrides for prefix MOD ("" or "modN_"), adding
	-- the irregularity category for each override unless noirreg is set.
	local function do_overrides(mod)
		for _, numgen in ipairs(data.allnumgens) do
			for _, state in ipairs(data.allstates) do
				local suffix = "_" .. numgen .. "_" .. state
				for _, case in ipairs(data.allcases) do
					local arg = mod .. case .. suffix
					handle_override(arg)
					if args[arg] and not data.noirreg then
						insert_cat(data, mod, numgen,
							"Arabic NOUNs with irregular SINGULAR",
							"SINGULAR of irregular NOUN")
					end
				end
			end
		end
	end

	-- Return the first non-empty lemma form list for prefix MOD, searching
	-- the active numgens and states in order; nil if there is none.
	local function get_lemma(mod)
		for _, numgen in ipairs(data.numgens()) do
			for _, state in ipairs(data.states) do
				local found = data.forms[mod .. "lemma_" .. numgen .. "_" .. state]
				if found and #found > 0 then
					return found
				end
			end
		end
		return nil
	end

	-- Form-key prefixes: "" for the base, then "mod_", "mod2_", ...
	local prefixes = {""}
	for _, mod in ipairs(mod_list) do
		table.insert(prefixes, mod .. "_")
	end

	for _, prefix in ipairs(prefixes) do
		do_overrides(prefix)
	end
	for _, prefix in ipairs(prefixes) do
		data.forms[prefix .. "lemma"] = get_lemma(prefix)
	end
	for _, prefix in ipairs(prefixes) do
		handle_override(prefix .. "lemma")
	end
end
-- Return the part of speech to use for MOD (either "", "mod_", "mod2_",
-- etc., same as in do_gender_number_1()). For the base (MOD == "") this is
-- data.pos. For a modifier data.pos is ignored: the result is "noun" when
-- the modifier is a construct (ʾiḍāfa) modifier, i.e. data[MOD .. "idafa"]
-- is set, and "adjective" otherwise.
function get_pos(data, mod)
	if mod == "" then
		return data.pos
	end
	return data[mod .. "idafa"] and "noun" or "adjective"
end
-- Find the stems associated with a particular gender/number combination.
-- ARGS is the set of all arguments. ARGPREFS is an array of argument prefixes
-- (e.g. "f" for the actual arguments "f", "f2", ..., for the feminine
-- singular; we allow more than one to handle "cpl"). SGS is a
-- "stem-type list" (see do_gender_number()), and is the list of stems to
-- base derived forms off of (masculine or feminine as appropriate), an array
-- of length-two arrays of {COMBINED_STEM, TYPE} as returned by
-- stem_and_type(). DEFAULT, ISFEM and NUM are as in do_gender_number().
-- MOD is either "", "mod_", "mod2_", etc. depending if we're working on a
-- base or modifier argument (in the latter case, basically if the argument
-- begins with "mod").
--
-- Returns a stem-type list. It is empty when no argument was given and
-- DEFAULT is nil.
function do_gender_number_1(data, args, argprefs, sgs, default, isfem, num, mod)
	local results = {}
	-- NOTE: this closure reads SGS at call time, so the reassignment of
	-- SGS further down is seen by later calls.
	local function handle_stem(stem)
		insert_stems(stem, results, sgs, isfem, num, get_pos(data, mod))
	end

	-- If no arguments specified, use the default instead.
	-- (Declared local so that it does not leak into the global environment.)
	local need_default = true
	for _, argpref in ipairs(argprefs) do
		if args[argpref] then
			need_default = false
			break
		end
	end
	if need_default then
		if default then
			handle_stem(default)
		end
		return results
	end

	-- For explicitly specified arguments, make sure there's at least one
	-- stem to generate off of; otherwise specifying e.g. 'sing=- pauc=فُلَان'
	-- won't override paucal.
	if #sgs == 0 then
		sgs = {{"", ""}}
	end
	for _, argpref in ipairs(argprefs) do
		if args[argpref] then
			handle_stem(args[argpref])
		end
		-- Numbered continuation arguments: ARGPREF2, ARGPREF3, ...
		local i = 2
		while args[argpref .. i] do
			handle_stem(args[argpref .. i])
			i = i + 1
		end
	end
	return results
end
-- For a given gender/number combination, parse and return the full set
-- of stems for both base and modifier. The return value is a
-- "stem specification", i.e. table with a "base" key for the base, a
-- "mod" key for the first modifier (see below), a "mod2" key for the
-- second modifier, etc. listing all stems for both the base and modifier(s).
-- The value of each key is a "stem-type list", i.e. an array of stem-type
-- pairs, where each element is a size-two array of {COMBINED_STEM, STEM_TYPE}.
-- COMBINED_STEM is a stem with attached transliteration in the form
-- STEM/TRANSLIT (where the transliteration is either manually specified in
-- the stem argument, e.g. 'pl=لُورْدَات/lordāt', or auto-transliterated from
-- the Arabic, with BOGUS_CHAR substituting for the transliteration if
-- auto-translit fails). STEM_TYPE is the declension of the stem, either
-- manually specified, e.g. 'بَبَّغَاء:di' for manually-specified diptote, or
-- auto-detected (see stem_and_type() and detect_type()).
--
-- DATA and ARGS are as in init(). ARGPREFS is an array of the prefixes for
-- the argument(s) specifying the stem (and optional translit and declension
-- type). For a given ARGPREF, we check ARGPREF, ARGPREF2, ARGPREF3, ... in
-- turn for the base, and modARGPREF, modARGPREF2, modARGPREF3, ... in turn
-- for the first modifier, and mod2ARGPREF, mod2ARGPREF2, mod2ARGPREF3, ...
-- for the second modifier, etc. SGS is a stem specification (see above),
-- giving the stems that are used to base derived forms off of (e.g. if a stem
-- type "smp" appears in place of a stem, the sound masculine plural of the
-- stems in SGS will be derived). DEFAULT is a single stem (i.e. a string) that
-- is used when no stems were explicitly given by the user (typically either
-- "f", "m", "d" or "p"), or nil for no default. ISFEM is true if we're
-- accumulating stems for a feminine number/gender category, and NUM is the
-- number (expected to be "sg", "du" or "pl") of the number/gender category
-- we're accumulating stems for.
--
-- About bases and modifiers: Note that e.g. in the noun phrase يَوْم الاِثْنَيْن
-- the head noun يَوْم is the base and the noun الاِثْنَيْن is the modifier.
-- In a noun phrase like البَحْر الأَبْيَض المُتَوَسِّط, there are two modifiers.
-- Note that modifiers come in two varieties, adjectival modifiers and
-- construct (ʾidāfa) modifiers. The first above noun phrase is an example
-- of a noun phrase with a construct modifier, where the base is fixed in
-- the construct state and the modifier is fixed in number and case
-- (which is always genitive) and possibly in state. The second above noun
-- phrase is an example of a noun phrase with two adjectival modifiers.
-- A construct modifier is generally a noun, whereas an adjectival modifier
-- is an adjective that usually agrees in state, number and case with the
-- base noun. (Note that in the case of multiple modifiers, it is possible
-- for e.g. the second modifier to be an adjectival modifier that agrees
-- with the first, construct, modifier, in which case its case will be fixed
-- to genitive, its number will be fixed to the same number as the first
-- modifier and its state will vary or not depending on whether the first
-- modifier's state varies. It is not possible in general to distinguish
-- adjectival and construct modifiers by looking at the values of
-- modN_state, modN_case or modN_number, since e.g. a third modifier could
-- have all of them specified and be either kind. Thus we have modN_idafa,
-- which is true for a construct modifier, false otherwise.)
function do_gender_number(data, args, argprefs, sgs, default, isfem, num)
	-- Build the stem specification: the "base" key holds the stem-type
	-- list for the base, and each "mod", "mod2", ... key the list for the
	-- corresponding modifier. The table is local to this call (it was
	-- formerly assigned to an undeclared, i.e. global, variable).
	local basemodtable = {
		base = do_gender_number_1(data, args, argprefs, sgs["base"],
			default, isfem, num, ""),
	}
	for _, mod in ipairs(mod_list) do
		-- The modifier's arguments are the base argument prefixes with
		-- the modifier name prepended, e.g. "pl" -> "modpl", "mod2pl".
		local modn_argprefs = {}
		for _, argpref in ipairs(argprefs) do
			table.insert(modn_argprefs, mod .. argpref)
		end
		-- A modifier that has no entry in SGS gets an empty stem-type list.
		basemodtable[mod] = do_gender_number_1(data, args, modn_argprefs,
			sgs[mod] or {}, default, isfem, num, mod .. "_")
	end
	return basemodtable
end
-- Run the declension function for one stem. COMBINED_STEM is split into its
-- Arabic and transliteration parts with split_arabic_tr(), and the function
-- registered in 'inflections' under type TY is called with them, together
-- with DATA, MOD (either "" for the base or "mod_", "mod2_", etc. for a
-- modifier) and NUMGEN (number or number-gender combination, of the sort
-- that forms part of the keys in DATA.FORMS). A type of "-" does nothing;
-- an unregistered type raises an error.
function call_inflection(combined_stem, ty, data, mod, numgen)
	if ty == "-" then
		return
	end
	local inflect = inflections[ty]
	if not inflect then
		error("Unknown inflection type '" .. ty .. "'")
	end
	local ar, tr = split_arabic_tr(combined_stem)
	inflect(ar, tr, data, mod, numgen)
end
-- Generate inflections for the stems of a given number/gender combination
-- and for either the base or a modifier. STEMTYPES is a stem-type list
-- (see do_gender_number()), listing all the stems and corresponding
-- declension types. MOD is either "", "mod_", "mod2_", etc. depending on
-- whether we're working on the base or a modifier. NUMGEN is the number or
-- number-gender combination we're working on, of the sort that forms part
-- of the keys in DATA.FORMS, e.g. "sg" or "m_sg".
function call_inflections(stemtypes, data, mod, numgen)
	-- The numgen a modifier is fixed to (modN_numgen), if any.
	local fixed_numgen = mod ~= "" and data[mod .. "numgen"]
	if fixed_numgen then
		-- A modifier with a fixed numgen is only declined for that numgen,
		-- but then unconditionally, since it may affect other numbers
		-- (cf. يَوْم الاِثْنَيْن).
		if fixed_numgen ~= numgen then
			return
		end
	elseif not contains(data.numbers, rsub(numgen, "^.*_", "")) then
		-- Otherwise skip numbers that are not going to be displayed. The
		-- number is what remains of NUMGEN after the last underscore.
		return
	end
	for _, stemtype in ipairs(stemtypes) do
		call_inflection(stemtype[1], stemtype[2], data, mod, numgen)
	end
end
-- Generate the entire set of inflections for a noun or adjective, and also
-- handle any manually-specified part of speech (pos=) and any manual
-- inflection overrides. INFLECTIONS is an array with one element per
-- number (or number/gender); each element is a size-two array of a stem
-- specification (the stems and corresponding declension types for the base
-- and any modifiers; see do_gender_number()) and a NUMGEN string identifying
-- the number or number/gender in question (e.g. "sg", "du", "pl", "m_sg",
-- "f_pl", etc.).
function do_inflections_and_overrides(data, args, inflections)
	-- Apply the POS override before generating inflections so that the
	-- change is reflected in categories.
	local pos_override = args["pos"]
	if pos_override then
		data.pos = pos_override
	end
	for _, inflection in ipairs(inflections) do
		local stem_spec, numgen = inflection[1], inflection[2]
		call_inflections(stem_spec["base"] or {}, data, "", numgen)
		for _, mod in ipairs(mod_list) do
			call_inflections(stem_spec[mod] or {}, data, mod .. "_", numgen)
		end
	end
	handle_lemma_and_overrides(data, args)
end
-- Helper function for get_heads(). Parses the head stems for either the
-- base or one modifier (see do_gender_number()). ARG1 is the argument
-- holding the first stem and ARGN is the prefix of the arguments holding
-- the remaining stems, which are numbered 2, 3, ... (the number is appended
-- to ARGN). For example, for the singular base, ARG1=1 and ARGN="head"; for
-- the first singular modifier, ARG1="mod" and ARGN="modhead"; for the plural
-- base, ARG1=ARGN="pl". MOD is either "", "mod_", "mod2_", etc. depending
-- if we're working on a base or modifier argument. The returned value is an
-- array of stems, where each element is a size-two array of
-- {COMBINED_STEM, STEM_TYPE}; it is empty if ARG1 was not given. See
-- do_gender_number().
function get_heads_1(data, args, arg1, argn, mod)
	local first = args[arg1]
	if not first then
		return {}
	end
	local heads = {}
	if first == "-" then
		-- "-" yields a single placeholder entry whose type is "-".
		heads[1] = {"", "-"}
	else
		insert_stems(first, heads, {{first, ""}}, false, "sg",
			get_pos(data, mod))
	end
	-- Remaining stems: ARGN2, ARGN3, ... up to the first missing one.
	local i = 2
	local extra = args[argn .. i]
	while extra do
		insert_stems(extra, heads, {{extra, ""}}, false, "sg",
			get_pos(data, mod))
		i = i + 1
		extra = args[argn .. i]
	end
	return heads
end
-- Very similar to do_gender_number(), and returns the same type of
-- structure, but works specifically for the stems of the head (the
-- most basic gender/number combination, e.g. singular for nouns,
-- masculine singular for adjectives and gendered nouns, collective
-- for collective nouns, etc.), including both base and modifier.
-- See do_gender_number().
--
-- HEADTYPE is a description of the head (e.g. "singular"); it is only used
-- in the error message raised when parameter 1 is missing.
--
-- Returns two values. The first is a stem specification of the same type as
-- returned by do_gender_number(). The second is a boolean that is true when
-- parameter 1 is missing and the current page is in the Template namespace
-- (i.e. we were called from a template documentation page, where no
-- user-specified arguments exist and a sample argument is substituted).
-- The reason for this boolean is to tell the caller that sample arguments
-- need to be substituted for the other numbers as well.
function get_heads(data, args, headtype)
	if not args[1] then
		if mw.title.getCurrentTitle().nsText == "Template" then
			return {base={{"{{{1}}}", "tri"}}}, true
		end
		error("Parameter 1 (" .. headtype .. " stem) may not be empty.")
	end
	-- Local to this call (it was formerly assigned to an undeclared, i.e.
	-- global, variable).
	local basemodtable = {base = get_heads_1(data, args, 1, "head", "")}
	for _, mod in ipairs(mod_list) do
		-- First stem of modifier N is in modN=, the rest in modNhead2=, ...
		basemodtable[mod] = get_heads_1(data, args, mod, mod .. "head", mod .. "_")
	end
	return basemodtable, false
end
-- Entry point for plain noun declension tables. Reads the arguments of the
-- parent frame, computes singular, dual and plural stems, runs the
-- inflections and returns the result of make_noun_table().
function export.show_noun(frame)
    local args, _, data = init(frame:getParent().args)
    data.pos = "danh từ"
    -- For plain nouns the number/gender combinations are just the numbers.
    data.numgens = function()
        return data.numbers
    end
    data.allnumgens = data.allnumbers

    local sgs, is_template = get_heads(data, args, "singular")
    local pls
    if is_template then
        -- Sample plural for template documentation pages.
        pls = {base={{"{{{pl}}}", "tri"}}}
    else
        pls = do_gender_number(data, args, {"pl", "cpl"}, sgs, nil, false, "pl")
    end
    -- The dual is always computed so that cases like يَوْم الاِثْنَيْن work,
    -- i.e. a singular with a dual modifier: data.number may refer only to
    -- the singular, but the dual pass is what parses the "modd" modifier
    -- dual argument. Once parsed, the resulting dual declension for
    -- اِثْنَيْن is stored in the modifier slot for all numbers, including
    -- the singular.
    local dus = do_gender_number(data, args, {"d"}, sgs, "d", false, "du")

    parse_state_etc_spec(data, args)
    determine_noun_numbers(data, args, pls)

    local stems_by_number = {{sgs, "sg"}, {dus, "du"}, {pls, "pl"}}
    do_inflections_and_overrides(data, args, stems_by_number)

    -- Make the table
    return make_noun_table(data)
end
-- Return true if at least one stem in STEM_SPEC (a table mapping "base" or
-- a modifier name to an array of stem/type entries) has a first element
-- matching tāʾ marbūṭa, then UNUOPT, then the "/" separator.
-- A modifier's stems are only examined when modN_numgen= was not given for
-- it: without that setting the modifier has to be declined for all
-- number/gender combinations, whereas with it the modifier is declined only
-- for the explicitly specified one.
function any_feminine(data, stem_spec)
    local function looks_feminine(entry)
        return rfind(entry[1], TAM .. UNUOPT .. "/")
    end
    for basemod, stemtypelist in pairs(stem_spec) do
        if basemod == "base" or not data[basemod .. "_numgen"] then
            for _, stemtype in ipairs(stemtypelist) do
                if looks_feminine(stemtype) then
                    return true
                end
            end
        end
    end
    return false
end
-- Return true unless some examined stem in STEM_SPEC (a table mapping
-- "base" or a modifier name to an array of stem/type entries) has a first
-- element that fails to match tāʾ marbūṭa, then UNUOPT, then the "/"
-- separator. An empty STEM_SPEC therefore yields true.
-- A modifier's stems are only examined when modN_numgen= was not given for
-- it: without that setting the modifier has to be declined for all
-- number/gender combinations, whereas with it the modifier is declined only
-- for the explicitly specified one.
function all_feminine(data, stem_spec)
    local function looks_feminine(entry)
        return rfind(entry[1], TAM .. UNUOPT .. "/")
    end
    for basemod, stemtypelist in pairs(stem_spec) do
        if basemod == "base" or not data[basemod .. "_numgen"] then
            for _, stemtype in ipairs(stemtypelist) do
                if not looks_feminine(stemtype) then
                    return false
                end
            end
        end
    end
    return true
end
-- The main entry point for collective noun tables. Reads the arguments of
-- the parent frame, computes collective, singulative, dual, paucal and
-- plural ("plural of variety") stems, runs the inflections and returns the
-- result of make_noun_table().
function export.show_coll_noun(frame)
    local args, _, data = init(frame:getParent().args)
    data.pos = "danh từ"
    data.allnumbers = {"coll", "sing", "du", "pauc", "pl"}
    data.engnumberscap["pl"] = "Plural of variety"
    data.numgens = function() return data.numbers end
    data.allnumgens = data.allnumbers
    local colls, is_template = get_heads(data, args, "collective")
    local pls = is_template and {base={{"{{{pl}}}", "tri"}}} or
        do_gender_number(data, args, {"pl", "cpl"}, colls, nil, false, "pl")
    parse_state_etc_spec(data, args)
    -- If the collective noun is already feminine in form, don't try to
    -- form a feminine singulative. (This previously tested an undefined
    -- variable, which is always nil, so the "f" default was requested
    -- unconditionally and the result of any_feminine() was ignored.)
    local collfem = any_feminine(data, colls)
    local sings = do_gender_number(data, args, {"sing"}, colls,
        not collfem and "f" or nil, true, "sg")
    -- Dual and paucal are derived from the singulative.
    local singfem = all_feminine(data, sings)
    local dus = do_gender_number(data, args, {"d"}, sings, "d", singfem, "du")
    local paucs = do_gender_number(data, args, {"pauc"}, sings, "paucp",
        singfem, "pl")
    -- Can manually specify which numbers are to appear, and exactly those
    -- numbers will appear. Otherwise, if any plurals given, plurals appear,
    -- and if singulative given, dual and paucal appear.
    if not parse_number_spec(data, args) then
        data.numbers = {}
        if args[1] ~= "-" then
            table.insert(data.numbers, "coll")
        end
        if #sings["base"] > 0 then
            table.insert(data.numbers, "sing")
        end
        if #dus["base"] > 0 then
            table.insert(data.numbers, "du")
        end
        if #paucs["base"] > 0 then
            table.insert(data.numbers, "pauc")
        end
        if #pls["base"] > 0 then
            table.insert(data.numbers, "pl")
        end
    end
    -- Generate the collective, singulative, dual, paucal and plural forms
    do_inflections_and_overrides(data, args,
        {{colls, "coll"}, {sings, "sing"}, {dus, "du"}, {paucs, "pauc"}, {pls, "pl"}})
    -- Make the table
    return make_noun_table(data)
end
-- Entry point for singulative noun declension tables. Reads the arguments
-- of the parent frame, computes singulative, collective, dual, paucal and
-- plural ("plural of variety") stems, runs the inflections and returns the
-- result of make_noun_table().
function export.show_sing_noun(frame)
    local args, _, data = init(frame:getParent().args)
    data.pos = "danh từ"
    data.allnumbers = {"sing", "coll", "du", "pauc", "pl"}
    data.engnumberscap["pl"] = "Plural of variety"
    data.numgens = function()
        return data.numbers
    end
    data.allnumgens = data.allnumbers
    parse_state_etc_spec(data, args)

    local sings, is_template = get_heads(data, args, "singulative")
    -- When every singulative is feminine in form, a masculine collective
    -- is requested as the default.
    local singfem = all_feminine(data, sings)
    local coll_default = singfem and "m" or nil
    local colls = do_gender_number(data, args, {"coll"}, sings,
        coll_default, false, "sg")
    local dus = do_gender_number(data, args, {"d"}, sings, "d", singfem, "du")
    local paucs = do_gender_number(data, args, {"pauc"}, sings, "paucp",
        singfem, "pl")
    local pls
    if is_template then
        -- Sample plural for template documentation pages.
        pls = {base={{"{{{pl}}}", "tri"}}}
    else
        pls = do_gender_number(data, args, {"pl", "cpl"}, colls, nil, false, "pl")
    end

    -- The numbers to show can be specified manually, in which case exactly
    -- those appear. Otherwise each number appears when it has base stems:
    -- plurals if any were given; collective, dual and paucal if given or
    -- derivable.
    if not parse_number_spec(data, args) then
        local numbers = {}
        data.numbers = numbers
        if args[1] ~= "-" then
            table.insert(numbers, "sing")
        end
        local candidates = {
            {colls, "coll"}, {dus, "du"}, {paucs, "pauc"}, {pls, "pl"},
        }
        for _, candidate in ipairs(candidates) do
            if #candidate[1]["base"] > 0 then
                table.insert(numbers, candidate[2])
            end
        end
    end

    -- Generate the singulative, collective, dual, paucal and plural forms
    do_inflections_and_overrides(data, args,
        {{sings, "sing"}, {colls, "coll"}, {dus, "du"}, {paucs, "pauc"}, {pls, "pl"}})

    -- Make the table
    return make_noun_table(data)
end
-- Shared implementation behind the adjective, gendered-noun and numeral
-- entry points. FRAME is the invoking frame (arguments are taken from its
-- parent), ISADJ selects adjective behaviour (a "p" default for the
-- masculine plural, parse_number_spec() instead of
-- determine_noun_numbers(), and make_adj_table() for output), and POS is
-- stored as data.pos.
function show_gendered(frame, isadj, pos)
    local args, _, data = init(frame:getParent().args)
    data.pos = pos

    -- Build the list of GENDER_NUMBER keys for every gender in
    -- data.allgenders crossed with the given numbers.
    local function gender_number_keys(numbers)
        local keys = {}
        for _, gender in ipairs(data.allgenders) do
            for _, number in ipairs(numbers) do
                table.insert(keys, gender .. "_" .. number)
            end
        end
        return keys
    end

    -- data.numbers is read at call time, data.allnumbers right now.
    data.numgens = function()
        return gender_number_keys(data.numbers)
    end
    data.allnumgens = gender_number_keys(data.allnumbers)

    parse_state_etc_spec(data, args)
    local msgs = get_heads(data, args, "masculine singular")
    -- All gender/number combinations are always computed so that cases
    -- like يَوْم الاِثْنَيْن work; see the comment in show_noun().
    local fsgs = do_gender_number(data, args, {"f"}, msgs, "f", true, "sg")
    local mdus = do_gender_number(data, args, {"d"}, msgs, "d", false, "du")
    local fdus = do_gender_number(data, args, {"fd"}, fsgs, "d", true, "du")
    local mpl_default = isadj and "p" or nil
    local mpls = do_gender_number(data, args, {"pl", "cpl"}, msgs,
        mpl_default, false, "pl")
    local fpls = do_gender_number(data, args, {"fpl", "cpl"}, fsgs, "fp",
        true, "pl")

    if not isadj then
        determine_noun_numbers(data, args, mpls)
    else
        parse_number_spec(data, args)
    end

    -- Generate the singular, dual and plural forms
    do_inflections_and_overrides(data, args, {
        {msgs, "m_sg"}, {fsgs, "f_sg"},
        {mdus, "m_du"}, {fdus, "f_du"},
        {mpls, "m_pl"}, {fpls, "f_pl"},
    })

    -- Make the table
    if not isadj then
        return make_gendered_noun_table(data)
    end
    return make_adj_table(data)
end
-- Entry point for gendered noun tables: show_gendered() in non-adjective
-- mode with the noun part-of-speech label.
function export.show_gendered_noun(frame)
    local isadj = false
    return show_gendered(frame, isadj, "danh từ")
end
-- Entry point for numeral tables: identical to show_gendered_noun() except
-- that the numeral part-of-speech label is passed as POS.
function export.show_numeral(frame)
    local isadj = false
    return show_gendered(frame, isadj, "số")
end
-- Entry point for adjective tables: show_gendered() in adjective mode with
-- the adjective part-of-speech label.
function export.show_adj(frame)
    local isadj = true
    return show_gendered(frame, isadj, "tính từ")
end
-- Inflection functions
-- Transliterate TERM with the language object's transliterate() method.
-- When that yields nothing, record the failure with
-- track("cant-translit") and return BOGUS_CHAR, or track()'s own result if
-- that is false or nil.
function do_translit(term)
    local translit = lang:transliterate(term)
    if translit then
        return translit
    end
    return track("cant-translit") and BOGUS_CHAR
end
-- Split a combined ARABIC/TRANSLIT expression into its two parts and
-- return them as two values. An empty TERM yields "", "". A TERM without a
-- slash is treated as bare Arabic and its transliteration is generated
-- with do_translit(). Raises an error if TERM contains more than one
-- slash.
function split_arabic_tr(term)
    if term == "" then
        return "", ""
    elseif not rfind(term, "/") then
        return term, do_translit(term)
    else
        -- Declared local; this used to leak into the global environment.
        local splitvals = rsplit(term, "/")
        if #splitvals ~= 2 then
            error("Must have at most one slash in a combined Arabic/translit expr: '" .. term .. "'")
        end
        return splitvals[1], splitvals[2]
    end
end
-- Put every shadda in WORD before the diacritic that precedes it.
-- NFC normalisation, which MediaWiki applies to all Unicode strings, turns
-- shadda + short vowel (including the tanwīn vowels -an -in -un) into
-- short vowel + shadda. That order makes detection inconvenient, so it is
-- undone here. Returns the reordered word.
function reorder_shadda(word)
    local diacritic_then_shadda = "(" .. DIACRITIC_ANY_BUT_SH .. ")" .. SH
    -- Parentheses keep this to a single return value.
    return (rsub(word, diacritic_then_shadda, SH .. "%1"))
end
-- Combine PREFIX, AR/TR, and ENDING in that order. PREFIX and ENDING
-- can be of the form ARABIC/TRANSLIT. The Arabic and translit parts are
-- separated out and grouped together, resulting in a string of the
-- form ARABIC/TRANSLIT (TRANSLIT will always be present, computed
-- automatically if not present in the source). The return value is actually
-- a list of ARABIC/TRANSLIT strings because hamza resolution is applied to
-- ARABIC, which may produce multiple outcomes (all of which will have the
-- same TRANSLIT).
function combine_with_ending(prefix, ar, tr, ending)
    local prefixar, prefixtr = split_arabic_tr(prefix)
    local endingar, endingtr = split_arabic_tr(ending)
    -- When calling hamza_seat(), leave out prefixes, which we expect to be
    -- clitics like وَ. (In case the prefix is a separate word, it won't
    -- matter whether we include it in the text passed to hamza_seat().)
    -- Declared local; this used to leak into the global environment.
    local allar = hamza_seat(ar .. endingar)
    -- Convert ...īān to ...iyān in case of stems ending in -ī or -ū
    -- (e.g. kubrī "bridge").
    if rfind(endingtr, "^[aeiouāēīōū]") then
        if rfind(tr, "ī$") then
            tr = rsub(tr, "ī$", "iy")
        elseif rfind(tr, "ū$") then
            tr = rsub(tr, "ū$", "uw")
        end
    end
    tr = prefixtr .. tr .. endingtr
    -- Also declared local now, for the same reason as allar.
    local allartr = {}
    for _, arval in ipairs(allar) do
        table.insert(allartr, prefixar .. arval .. "/" .. tr)
    end
    return allartr
end
-- Build PREFIX + STEM/TR + ENDING and add the result to DATA.forms[KEY],
-- creating that list if it does not exist yet; insertion goes through
-- insert_if_not(), so duplicates are not added. ENDING may be a single
-- ending or a list of endings, in which case each one is combined with the
-- other parts in turn. PREFIX and ENDING may be of the form
-- ARABIC/TRANSLIT (the stem is already split into Arabic STEM and Latin
-- TR). Each inserted item is itself a list of ARABIC/TRANSLIT strings; when
-- it holds more than one, they are hamza variants, i.e. different ways of
-- writing a hamza sound, such as مُبْتَدَؤُون vs. مُبْتَدَأُون (see init_data()).
function add_inflection(data, key, prefix, stem, tr, ending)
    local forms = data.forms
    if forms[key] == nil then
        forms[key] = {}
    end
    -- Normalise a lone ending to a one-element list.
    local endings = type(ending) == "table" and ending or {ending}
    for _, endingval in ipairs(endings) do
        local combined = combine_with_ending(prefix, stem, tr, endingval)
        insert_if_not(forms[key], combined)
    end
end
-- Form inflections from combination of STEM, with transliteration TR,
-- and ENDINGS (and definite article where necessary, plus any specified
-- prefixes) and store in DATA, for the number or gender/number
-- determined by MOD ("", "mod_", "mod2_", etc.; see call_inflection()) and
-- NUMGEN ("sg", "du", "pl", or "m_sg", "f_pl", etc. for adjectives). ENDINGS
-- is an array of 15 values, each of which is a string or array of
-- alternatives. The order of ENDINGS is indefinite nom, acc, gen; definite
-- nom, acc, gen; construct-state nom, acc, gen; informal indefinite,
-- definite, construct; lemma indefinite, definite, construct. (Normally the
-- lemma is based off of the indefinite, but if the inflection has been
-- restricted to particular states, it comes from one of those states, in
-- the order indefinite, definite, construct.) See also add_inflection() for
-- more info on exactly what is inserted into DATA.
function add_inflections(stem, tr, data, mod, numgen, endings)
    stem = canon_hamza(stem)
    assert(#endings == 15)
    local ismod = mod ~= ""
    -- If working on modifier and modN_numgen= is given, it better agree
    -- with NUMGEN; the case where it doesn't agree should have been caught
    -- in call_inflections().
    if ismod and data[mod .. "numgen"] then
        assert(data[mod .. "numgen"] == numgen)
    end
    -- Compute the definite stem and translit. Each stored form may end up
    -- with more than one spelling because of alternative hamza seats,
    -- e.g. مُبْتَدَؤُون or مُبْتَدَأُون (mubtadaʾūn "(grammatical) subjects").
    local defstem, deftr
    if stem == "?" or data[mod .. "omitarticle"] then
        defstem = stem
        deftr = tr
    else
        -- apply sun-letter assimilation and hamzat al-wasl elision
        defstem = rsub("الْ" .. stem, "^الْ([سشصتثطدذضزژظنرل])", "ال%1ّ")
        defstem = rsub(defstem, "^الْ([اٱ])([ًٌٍَُِ])", "ال%2%1")
        deftr = rsub("al-" .. tr, "^al%-([sšṣtṯṭdḏḍzžẓnrḷ])", "a%1-%1")
    end
    -- For a given MOD spec, is the previous word (base or modifier) a noun?
    -- We assume the base is always a noun in this case, and otherwise
    -- look at the value of modN_idafa.
    local function prev_mod_is_noun(mod)
        if mod == "mod_" then
            return true
        end
        if mod == "mod2_" then
            return data["mod_idafa"]
        end
        -- Declared local (this used to leak into the global environment)
        -- and converted to a number explicitly instead of relying on
        -- string-to-number coercion in the subtraction.
        local modnum = assert_rsub(mod, "^mod([0-9]+)_$", "%1")
        local prevnum = tonumber(modnum) - 1
        return data["mod" .. prevnum .. "_idafa"]
    end
    local numgens = ismod and data[mod .. "numgen"] and data.numgens() or {numgen}
    -- "defcon" means definite adjective modifying construct state noun. We
    -- add a ... before the adjective (and after the construct-state noun) to
    -- indicate that a nominal modifier would go between noun and adjective.
    local stems = {ind = stem, def = defstem, con = stem,
        defcon = "... " .. defstem}
    local trs = {ind = tr, def = deftr, con = tr, defcon = "... " .. deftr}
    for _, ng in ipairs(numgens) do
        for _, state in ipairs(data.allstates) do
            for _, case in ipairs(data.allcases_with_lemma) do
                -- We are generating the inflections for STATE, but sometimes
                -- we want to use the inflected form of a different state,
                -- e.g. if modN_state= or basestate= is set to some
                -- particular state. If we're dealing with an adjectival
                -- modifier, then in place of "con" we use "defcon" if
                -- immediately after a noun (see comment above), else "def".
                local thestate = ismod and data[mod .. "state"] or
                    ismod and not data[mod .. "idafa"] and state == "con" and
                    (prev_mod_is_noun(mod) and "defcon" or "def") or
                    not ismod and data.basestate or
                    state
                local is_lemmainf = case == "lemma" or case == "inf"
                -- Don't substitute value of modcase for lemma/informal "cases"
                local thecase = is_lemmainf and case or
                    ismod and data[mod .. "case"] or case
                add_inflection(data, mod .. case .. "_" .. ng .. "_" .. state,
                    data[mod .. "prefix"] or "",
                    stems[thestate], trs[thestate],
                    endings[data.statecases[thestate][thecase]])
            end
        end
    end
end
-- Record the category and the human-readable declension type for one
-- declension (e.g. masculine singular for adjectives). MOD and NUMGEN are
-- as in call_inflection(). CATVALUE is the category name and ENGVALUE the
-- description of the declension type; in both, NOUN is replaced with
-- data.pos, SINGULAR with the number name looked up in data.engnumbers for
-- the number part of NUMGEN, and BROKSING with the same name but with
-- "plural" and "paucal" turned into "broken plural" and "broken paucal".
-- The category is added to data.categories only for the base (MOD == "")
-- and only if non-empty; a non-empty description is added to
-- data.forms[MOD .. NUMGEN .. "_type"]. Extra categories are added for
-- broken plurals, for values mentioning "irregular", and when the states
-- include "def" but not "ind".
function insert_cat(data, mod, numgen, catvalue, engvalue)
    local number_name = data.engnumbers[rsub(numgen, "^.*_", "")]
    assert(number_name ~= nil)
    local broken_name = rsub(number_name, "plural", "broken plural")
    broken_name = rsub(broken_name, "paucal", "broken paucal")

    -- These checks look at the unexpanded templates, so they run before
    -- any placeholder substitution.
    if rfind(broken_name, "broken plural") then
        if rfind(catvalue, "BROKSING") or rfind(engvalue, "BROKSING") then
            table.insert(data.categories, "Arabic " .. data.pos .. "s with broken plural")
        end
    end
    if rfind(catvalue, "irregular") or rfind(engvalue, "irregular") then
        table.insert(data.categories, "Arabic irregular " .. data.pos .. "s")
    end

    -- Expand the NOUN, SINGULAR and BROKSING placeholders, in that order.
    local function fill_in(template)
        template = rsub(template, "NOUN", data.pos)
        template = rsub(template, "SINGULAR", number_name)
        return (rsub(template, "BROKSING", broken_name))
    end
    catvalue = fill_in(catvalue)
    engvalue = fill_in(engvalue)

    if mod == "" and catvalue ~= "" then
        insert_if_not(data.categories, catvalue)
    end
    if engvalue ~= "" then
        local key = mod .. numgen .. "_type"
        local types = data.forms[key]
        if types == nil then
            types = {}
            data.forms[key] = types
        end
        insert_if_not(types, engvalue)
    end
    if contains(data.states, "def") and not contains(data.states, "ind") then
        insert_if_not(data.categories, "Arabic definite " .. data.pos .. "s")
    end
end
-- Tell whether modifier inflections are being handled (MOD is not "") and
-- the modifier's case, DATA[MOD .. "case"], is restricted to an oblique
-- case ("acc" or "gen"; typically genitive, in an ʾidāfa construction).
-- Used when producing lemma inflections: the modifier part of the lemma
-- should agree with the modifier's case when that case is restricted.
-- Returns false for the base, the falsy case value itself (nil or false)
-- when no case is set, and otherwise a boolean.
function mod_oblique(mod, data)
    if mod == "" then
        return false
    end
    local modcase = data[mod .. "case"]
    if not modcase then
        return modcase
    end
    return modcase == "acc" or modcase == "gen"
end
-- Like mod_oblique(), but only for a modifier case restricted to the
-- accusative (rare or nonexistent in practice). Returns false for the
-- base (MOD == ""), the falsy case value itself (nil or false) when no
-- case is set, and otherwise whether the case equals "acc".
function mod_acc(mod, data)
    if mod == "" then
        return false
    end
    local modcase = data[mod .. "case"]
    if not modcase then
        return modcase
    end
    return modcase == "acc"
end
-- Shared worker for triptote and diptote declensions. STEM/TR are the
-- Arabic stem and its transliteration, DATA/MOD/NUMGEN are passed through
-- to add_inflections() and insert_cat(), IS_DIP selects diptote endings
-- and LC selects the "long construct" variant (long vowels in the
-- construct state). The case/state endings are added first; the informal
-- and lemma endings are added in a second step that depends on the shape
-- of the stem.
function triptote_diptote(stem, tr, data, mod, numgen, is_dip, lc)
    -- Remove any explicit nominative case ending.
    local nom_ending = "[" .. UN .. U .. "]$"
    if rfind(stem, nom_ending) then
        stem = rsub(stem, nom_ending, "")
        tr = rsub(tr, "un?$", "")
    end

    local has_tam = rfind(stem, TAM .. "$")
    -- Special case for صلوة pronounced ṣalāh; detected from the translit,
    -- which must be inspected before its final h is rewritten below.
    local is_aah = has_tam and rfind(tr, "āh$")
    if has_tam then
        -- Make the translit of a tāʾ marbūṭa stem end in -t.
        if rfind(tr, "h$") then
            tr = rsub(tr, "h$", "t")
        elseif not rfind(tr, "t$") then
            tr = tr .. "t"
        end
    end

    -- Indefinite endings.
    local ind_nom, ind_acc, ind_gen
    if is_dip then
        ind_nom, ind_acc, ind_gen = U, A, A
    else
        -- No alif is written after the accusative tanwīn when the stem
        -- ends in hamza-on-alif, tāʾ marbūṭa, or alif/alif madda + hamza.
        local no_alif = rfind(stem, "[" .. HAMZA_ON_ALIF .. TAM .. "]$")
            or rfind(stem, "[" .. AMAD .. ALIF .. "]" .. HAMZA .. "$")
        ind_nom = UN
        ind_acc = AN .. (no_alif and "" or ALIF)
        ind_gen = IN
    end

    add_inflections(stem, tr, data, mod, numgen,
        {ind_nom, ind_acc, ind_gen,
         U, A, I,
         lc and UU or U, lc and AA or A, lc and II or I,
         {}, {}, {}, -- informal inflections are added below
         {}, {}, {}, -- lemma inflections are added below
        })

    -- Add only informal and lemma endings (three each), leaving the nine
    -- case/state slots empty.
    local function add_informal_and_lemma(the_stem, the_tr, informal, lemma)
        add_inflections(the_stem, the_tr, data, mod, numgen,
            {{}, {}, {},
             {}, {}, {},
             {}, {}, {},
             informal[1], informal[2], informal[3],
             lemma[1], lemma[2], lemma[3],
            })
    end

    local tote = lc and "long construct" or is_dip and "hai cách" or "ba cách"
    local singpl_tote = "BROKSING " .. tote
    local cat_prefix = "Arabic NOUNs with " .. tote .. " BROKSING"

    -- Since the translit is checked for -āh, checking the stem as well is
    -- probably not needed.
    if is_aah or rfind(stem, "[" .. AMAD .. ALIF .. "]" .. TAM .. "$") then
        add_informal_and_lemma(stem, rsub(tr, "t$", ""),
            {"/t", "/t", "/t"},  -- informal pron. is -āt
            {"/h", "/h", "/t"})  -- lemma uses -āh
        insert_cat(data, mod, numgen, cat_prefix .. " in -āh",
            singpl_tote .. " có đuôi " .. make_link(HYPHEN .. AAH))
    elseif has_tam then
        add_informal_and_lemma(stem, rsub(tr, "t$", ""),
            {"", "", "/t"},
            {"", "", "/t"})
        insert_cat(data, mod, numgen, cat_prefix .. " in -a",
            singpl_tote .. " có đuôi " .. make_link(HYPHEN .. AH))
    elseif lc then
        add_informal_and_lemma(stem, tr,
            {"", "", UU},
            {"", "", UU})
        insert_cat(data, mod, numgen, cat_prefix,
            singpl_tote)
    else
        if rfind(stem, IY .. SH .. "$") then
            -- The nisba ending has its own informal pronunciation, so the
            -- informal forms use a shortened stem while the lemma forms
            -- keep the full one.
            local infstem = rsub(stem, SH .. "$", "")
            local inftr = rsub(tr, "iyy$", "ī")
            add_informal_and_lemma(infstem, inftr,
                {"", "", ""},
                {{}, {}, {}})
            add_informal_and_lemma(stem, tr,
                {{}, {}, {}},
                {"", "", ""})
        else
            add_informal_and_lemma(stem, tr,
                {"", "", ""},
                {"", "", ""})
        end
        insert_cat(data, mod, numgen, "Arabic NOUNs with basic " .. tote .. " BROKSING",
            singpl_tote .. " cơ bản")
    end
end
-- Declension handler "tri": regular triptote, i.e. triptote_diptote()
-- with the diptote flag off.
inflections.tri = function(stem, tr, data, mod, numgen)
    local is_dip = false
    triptote_diptote(stem, tr, data, mod, numgen, is_dip)
end
-- Declension handler "di": regular diptote, i.e. triptote_diptote() with
-- the diptote flag on.
inflections.di = function(stem, tr, data, mod, numgen)
    local is_dip = true
    triptote_diptote(stem, tr, data, mod, numgen, is_dip)
end
-- Shared worker for elatives and color/defect adjectives. These usually
-- decline as diptotes, but a stem ending in alif or alif maqṣūra is
-- handled as invariable.
function elative_color_defect(stem, tr, data, mod, numgen)
    local ends_in_alif = rfind(stem, "[" .. ALIF .. AMAQ .. "]$")
    if not ends_in_alif then
        triptote_diptote(stem, tr, data, mod, numgen, true)
        return
    end
    invariable(stem, tr, data, mod, numgen)
end
-- Declension handler "el" (elative): delegates to elative_color_defect(),
-- so it is usually diptote but may be invariable.
inflections.el = function(stem, tr, data, mod, numgen)
    elative_color_defect(stem, tr, data, mod, numgen)
end
-- Declension handler "cd" (color/defect adjective): handled exactly like
-- the elative, via elative_color_defect().
inflections.cd = function(stem, tr, data, mod, numgen)
    elative_color_defect(stem, tr, data, mod, numgen)
end
-- Declension handler "lc": triptote whose construct-state endings are
-- lengthened, i.e. triptote_diptote() with the diptote flag off and the
-- long-construct flag on.
inflections.lc = function(stem, tr, data, mod, numgen)
    local is_dip, long_construct = false, true
    triptote_diptote(stem, tr, data, mod, numgen, is_dip, long_construct)
end
-- Shared worker for defective declensions in -in. STEM must end in
-- kasratān (an error is raised otherwise); that ending and the translit's
-- final "in" are removed before the endings are attached. TRI selects the
-- "triptote" variant, whose indefinite accusative ends in fatḥatān + alif;
-- the "diptote" variant uses a plain fatḥa there.
function in_defective(stem, tr, data, mod, numgen, tri)
    local in_ending = IN .. "$"
    if not rfind(stem, in_ending) then
        error("'in' declension stem should end in -in: '" .. stem .. "'")
    end
    stem = rsub(stem, in_ending, "")
    tr = rsub(tr, "in$", "")

    local acc_ind_ending
    if tri then
        acc_ind_ending = IY .. AN .. ALIF
    else
        acc_ind_ending = IY .. A
    end
    -- FIXME: What should happen with the lemma when modifier case
    -- is limited to the accusative and modifier state is e.g. definite?
    -- Should the lemma end in -iya or -ī? In practice this will rarely
    -- if ever happen.
    local lemma_ind = mod_acc(mod, data) and acc_ind_ending or IN

    add_inflections(stem, tr, data, mod, numgen,
        {IN, acc_ind_ending, IN,
         II, IY .. A, II,
         II, IY .. A, II,
         II, II, II,
         lemma_ind, II, II,
        })

    local tote = tri and "triptote" or "diptote"
    insert_cat(data, mod, numgen, "Arabic NOUNs with " .. tote .. " BROKSING in -in",
        "BROKSING " .. tote .. " có đuôi " .. make_link(HYPHEN .. IN))
end
-- Choose between the two defective -in declensions. Returns "diin" only
-- when ISPL is truthy and STEM matches the consonant/vowel pattern of
-- words like layālin; every other -in word gets "triin".
function detect_in_type(stem, ispl)
    if not ispl then
        return "triin"
    end
    local layalin_shape = "^" .. CONS .. AOPT .. CONS .. AOPTA .. CONS .. IN .. "$"
    if rfind(stem, layalin_shape) then -- layālin
        return "diin"
    end
    return "triin" -- other -in words
end
-- Declension handler "in": defective in -in, with the triptote/diptote
-- variant chosen by detect_in_type(). The stem counts as plural whenever
-- NUMGEN contains "pl".
inflections["in"] = function(stem, tr, data, mod, numgen)
    local ispl = rfind(numgen, "pl")
    local tri = detect_in_type(stem, ispl) == "triin"
    in_defective(stem, tr, data, mod, numgen, tri)
end
-- Declension handler "triin": defective in -in, always using the
-- "triptote" variant.
inflections.triin = function(stem, tr, data, mod, numgen)
    local tri = true
    in_defective(stem, tr, data, mod, numgen, tri)
end
-- Declension handler "diin": defective in -in, always using the
-- "diptote" variant.
inflections.diin = function(stem, tr, data, mod, numgen)
    local tri = false
    in_defective(stem, tr, data, mod, numgen, tri)
end
-- Declension handler "an": defective in -an. There are two variants,
-- depending on whether the stem is spelled with a tall alif or an alif
-- maqṣūra after the fatḥatān; any other stem shape raises an error. The
-- indefinite forms (and the indefinite lemma) keep the tanwīn spelling,
-- all other forms use the long-vowel ending that matches the final letter.
inflections["an"] = function(stem, tr, data, mod, numgen)
    local final_letter, long_ending
    if rfind(stem, AN .. ALIF .. "$") then
        final_letter, long_ending = ALIF, AA
    elseif rfind(stem, AN .. AMAQ .. "$") then
        final_letter, long_ending = AMAQ, AAMAQ
    else
        error("Invalid stem for 'an' declension type: " .. stem)
    end
    local tanwin_ending = AN .. final_letter
    stem = rsub(stem, tanwin_ending .. "$", "")
    tr = rsub(tr, "an$", "")

    add_inflections(stem, tr, data, mod, numgen,
        {tanwin_ending, tanwin_ending, tanwin_ending,
         long_ending, long_ending, long_ending,
         long_ending, long_ending, long_ending,
         long_ending, long_ending, long_ending,
         tanwin_ending, long_ending, long_ending,
        })
    -- FIXME: Should we distinguish between tall alif and alif maqṣūra?
    insert_cat(data, mod, numgen, "Arabic NOUNs with BROKSING in -an",
        "BROKSING in " .. make_link(HYPHEN .. tanwin_ending))
end
-- Shared worker for invariable declensions: all 15 ending slots are the
-- empty string, so every form is the bare stem (plus whatever
-- add_inflections() itself adds, such as the definite article).
function invariable(stem, tr, data, mod, numgen)
    local no_endings = {}
    for slot = 1, 15 do
        no_endings[slot] = ""
    end
    add_inflections(stem, tr, data, mod, numgen, no_endings)
    insert_cat(data, mod, numgen, "Arabic NOUNs with invariable BROKSING",
        "BROKSING invariable")
end
-- Declension handler "inv": invariable in -ā, non-loanword type. Delegates
-- to invariable().
inflections.inv = function(stem, tr, data, mod, numgen)
    invariable(stem, tr, data, mod, numgen)
end
-- Declension handler "lwinv": invariable in -ā, loanword type (thought to
-- behave in the dual as if ending in -a). Here it is declined exactly like
-- "inv", through invariable().
inflections.lwinv = function(stem, tr, data, mod, numgen)
    invariable(stem, tr, data, mod, numgen)
end
-- Declension handler "d": duals. STEM must end in alif + nūn or alif
-- madda + nūn, each with an optional final kasra; anything else raises an
-- error. The dual ending is removed (alif madda leaves a hamza placeholder
-- behind) and the dual endings are attached. When the modifier case is
-- restricted to an oblique case, the lemma uses the oblique endings.
inflections["d"] = function(stem, tr, data, mod, numgen)
    local optional_ni_at_end = NI .. "?$"
    if rfind(stem, ALIF .. optional_ni_at_end) then
        stem = rsub(stem, AOPTA .. optional_ni_at_end, "")
    elseif rfind(stem, AMAD .. optional_ni_at_end) then
        stem = rsub(stem, AMAD .. optional_ni_at_end, HAMZA_PH)
    else
        error("Dual stem should end in -ān(i): '" .. stem .. "'")
    end
    tr = rsub(tr, "āni?$", "")

    -- Lemma endings: nominative by default, oblique for an oblique modifier.
    local lemma_full, lemma_con = AAN, AA
    if mod_oblique(mod, data) then
        lemma_full, lemma_con = AYN, AYSK
    end

    add_inflections(stem, tr, data, mod, numgen,
        {AANI, AYNI, AYNI,
         AANI, AYNI, AYNI,
         AA, AYSK, AYSK,
         AYN, AYN, AYSK,
         lemma_full, lemma_full, lemma_con,
        })
    insert_cat(data, mod, numgen, "", "dual in " .. make_link(HYPHEN .. AANI))
end
-- Declension handler "smp": sound masculine plural. STEM must match UUNA
-- with an optional final vowel, i.e. end in -ūn(a); otherwise an error is
-- raised. That ending is removed and the sound masculine plural endings
-- are attached. When the modifier case is restricted to an oblique case,
-- the lemma uses the oblique endings.
inflections["smp"] = function(stem, tr, data, mod, numgen)
    local uuna_at_end = UUNA .. "?$"
    if not rfind(stem, uuna_at_end) then
        error("Sound masculine plural stem should end in -ūn(a): '" .. stem .. "'")
    end
    stem = rsub(stem, uuna_at_end, "")
    tr = rsub(tr, "ūna?$", "")

    -- Lemma endings: nominative by default, oblique for an oblique modifier.
    local lemma_full, lemma_con = UUN, UU
    if mod_oblique(mod, data) then
        lemma_full, lemma_con = IIN, II
    end

    add_inflections(stem, tr, data, mod, numgen,
        {UUNA, IINA, IINA,
         UUNA, IINA, IINA,
         UU, II, II,
         IIN, IIN, II,
         lemma_full, lemma_full, lemma_con,
        })
    -- SINGULAR is used because this could conceivably serve for the paucal
    -- rather than the plural.
    insert_cat(data, mod, numgen, "Arabic NOUNs with sound masculine SINGULAR",
        "sound masculine SINGULAR")
end
-- Declension handler "sfp": sound feminine plural. STEM must end in alif
-- or alif madda + tāʾ, optionally followed by ḍammatān; otherwise an error
-- is raised. A final ḍammatān (and "un" in the translit) is removed before
-- the endings are attached.
inflections["sfp"] = function(stem, tr, data, mod, numgen)
    local sfp_shape = "[" .. ALIF .. AMAD .. "]" .. T .. UN .. "?$"
    if rfind(stem, sfp_shape) then
        stem = rsub(stem, UN .. "$", "")
        tr = rsub(tr, "un$", "")
        add_inflections(stem, tr, data, mod, numgen,
            {UN, IN, IN,
             U, I, I,
             U, I, I,
             "", "", "",
             "", "", "",
            })
        -- SINGULAR is used because this might serve for the paucal rather
        -- than the plural.
        insert_cat(data, mod, numgen, "Arabic NOUNs with sound feminine SINGULAR",
            "sound feminine SINGULAR")
    else
        error("Sound feminine plural stem should end in -āt(un): '" .. stem .. "'")
    end
end
-- Declension handler "awnp": plural of defectives in -an. STEM must match
-- AWNA with an optional final vowel, i.e. end in -awn(a); otherwise an
-- error is raised. That ending is removed and the -awna/-ayna endings are
-- attached. When the modifier case is restricted to an oblique case, the
-- lemma uses the oblique endings.
inflections["awnp"] = function(stem, tr, data, mod, numgen)
    local awna_at_end = AWNA .. "?$"
    if not rfind(stem, awna_at_end) then
        error("'awnp' plural stem should end in -awn(a): '" .. stem .. "'")
    end
    stem = rsub(stem, awna_at_end, "")
    tr = rsub(tr, "awna?$", "")

    -- Lemma endings: nominative by default, oblique for an oblique modifier.
    local lemma_full, lemma_con = AWN, AWSK
    if mod_oblique(mod, data) then
        lemma_full, lemma_con = AYN, AYSK
    end

    add_inflections(stem, tr, data, mod, numgen,
        {AWNA, AYNA, AYNA,
         AWNA, AYNA, AYNA,
         AWSK, AYSK, AYSK,
         AYN, AYN, AYSK,
         lemma_full, lemma_full, lemma_con,
        })
    -- SINGULAR is used because this could conceivably serve for the paucal
    -- rather than the plural.
    insert_cat(data, mod, numgen, "Arabic NOUNs with sound SINGULAR in -awna",
        "sound SINGULAR in " .. make_link(HYPHEN .. AWNA))
end
-- Declension handler "?": unknown declension. The supplied stem and
-- translit are ignored; "?" is used for both, with empty endings in all 15
-- slots.
inflections["?"] = function(stem, tr, data, mod, numgen)
    local no_endings = {}
    for slot = 1, 15 do
        no_endings[slot] = ""
    end
    add_inflections("?", "?", data, mod, numgen, no_endings)
    insert_cat(data, mod, numgen, "Arabic NOUNs with unknown SINGULAR",
        "SINGULAR unknown")
end
-- Detect declension of noun or adjective stem or lemma. We allow triptotes,
-- diptotes and sound plurals to either come with ʾiʿrāb or not. We detect
-- some cases where vowels are missing, when it seems fairly unambiguous to
-- do so. ISFEM is true if we are dealing with a feminine stem (not
-- currently used and needs to be rethought). NUM is "sg", "du", or "pl",
-- depending on the number of the stem.
--
-- POS is the part of speech, generally "noun" or "adjective". Used to
-- distinguish nouns and adjectives of the فَعْلَان type. There are nouns of
-- this type and they generally are triptotes, e.g. قَطْرَان "tar"
-- and شَيْطَان "devil". An additional complication is that the user can set
-- the POS to something else, like "numeral". We don't use this POS for
-- modifiers, where we determine whether they are noun-like or adjective-like
-- according to whether mod_idafa= is true.
--
-- Some unexpectedly diptote nouns/adjectives:
--
-- jiʿrān in ʾabū jiʿrān "dung beetle"
-- distributive numbers: ṯunāʾ "two at a time", ṯulāṯ/maṯlaṯ "three at a time",
-- rubāʿ "four at a time" (not a regular diptote pattern, cf. triptote
-- junāḥ "misdemeanor, sin", nujār "origin, root", nuḥām "flamingo")
-- jahannam (f.) "hell"
-- many names: jilliq/jillaq "Damascus", judda/jidda "Jedda", jibrīl (and
-- variants) "Gabriel", makka "Mecca", etc.
-- jibriyāʾ "pride"
-- kibriyāʾ "glory, pride"
-- babbaḡāʾ "parrot"
-- ʿayāyāʾ "incapable, tired"
-- suwaidāʾ "black bile, melancholy"
-- Note also: ʾajhar "day-blind" (color-defect) and ʾajhar "louder" (elative)
--
-- Returns the declension type as a string: "d", "diin" or "triin" (from
-- detect_in_type()), "an", "awnp", "sfp", "smp", "tri", "di", "inv" or
-- "lwinv". Raises an error for a dual stem that does not end in -ān(i) and
-- for a stem that ends in a bare fatḥatān. The checks are tried in order and
-- the first match wins, so the order of the branches matters.
function export.detect_type(stem, isfem, num, pos)
-- Record a tracking entry for WORD and for WORD/POS. Always returns true so
-- that it can be chained with "and" after a pattern test in the conditions
-- below.
local function dotrack(word)
track(word)
track(word .. "/" .. pos)
return true
end
-- Not strictly necessary because the caller (stem_and_type) already
-- reorders, but won't hurt, and may be necessary if this function is
-- called from an external caller.
stem = reorder_shadda(stem)
local origstem = stem
-- So that we don't get tripped up by alif madda, we replace alif madda
-- with the sequence hamza + fatḥa + alif before the regexps below.
stem = rsub(stem, AMAD, HAMZA .. AA)
-- Duals are decided purely by their ending; nothing else is tried for them.
if num == "du" then
if rfind(stem, ALIF .. NI .. "?$") then
return "d"
else
error("Malformed stem for dual, should end in the nominative dual ending -ān(i): '" .. origstem .. "'")
end
end
if rfind(stem, IN .. "$") then -- -in words
return detect_in_type(stem, num == "pl")
elseif rfind(stem, AN .. "[" .. ALIF .. AMAQ .. "]$") then
return "an"
elseif rfind(stem, AN .. "$") then
error("Malformed stem, fatḥatan should be over second-to-last letter: " .. origstem)
elseif num == "pl" and rfind(stem, AW .. SKOPT .. N .. AOPT .. "$") then
return "awnp"
elseif num == "pl" and rfind(stem, ALIF .. T .. UNOPT .. "$") and
-- Avoid getting tripped up by plurals like ʾawqāt "times",
-- ʾaḥwāt "fishes", ʾabyāt "verses", ʾazyāt "oils", ʾaṣwāt "voices",
-- ʾamwāt "dead (pl.)".
not rfind(stem, HAMZA_ON_ALIF .. A .. CONS .. SK .. CONS .. AAT .. UNOPT .. "$") then
return "sfp"
elseif num == "pl" and rfind(stem, W .. N .. AOPT .. "$") and
-- Avoid getting tripped up by plurals like ʿuyūn "eyes",
-- qurūn "horns" (note we check for U between first two consonants
-- so we correctly ignore cases like sinūn "hours" (from sana),
-- riʾūn "lungs" (from riʾa) and banūn "sons" (from ibn).
not rfind(stem, "^" .. CONS .. U .. CONS .. UUN .. AOPT .. "$") then
return "smp"
elseif rfind(stem, UN .. "$") then -- explicitly specified triptotes (we catch sound feminine plurals above)
return "tri"
elseif rfind(stem, U .. "$") then -- explicitly specified diptotes
return "di"
elseif -- num == "pl" and
( -- various diptote plural patterns; these are diptote even in the singular (e.g. yanāyir "January", falāfil "falafel", tuʾabāʾ "yawn, fatigue"
-- currently we sometimes end up with such plural patterns in the "singular" in a singular
-- ʾidāfa construction with plural modifier. (FIXME: These should be fixed to the correct number.)
rfind(stem, "^" .. CONS .. AOPT .. CONS .. AOPTA .. CONS .. IOPT .. Y .. "?" .. CONS .. "$") and dotrack("fawaakih") or -- fawākih, daqāʾiq, makātib, mafātīḥ
rfind(stem, "^" .. CONS .. AOPT .. CONS .. AOPTA .. CONS .. SH .. "$")
and not rfind(stem, "^" .. T) and dotrack("mawaadd") or -- mawādd, maqāmm, ḍawāll; exclude t- so we don't catch form-VI verbal nouns like taḍādd (HACK!!!)
rfind(stem, "^" .. CONS .. U .. CONS .. AOPT .. CONS .. AOPTA .. HAMZA .. "$") and dotrack("wuzaraa") or -- wuzarāʾ "ministers", juhalāʾ "ignorant (pl.)"
rfind(stem, ELCD_START .. SKOPT .. CONS .. IOPT .. CONS .. AOPTA .. HAMZA .. "$") and dotrack("asdiqaa") or -- ʾaṣdiqāʾ
rfind(stem, ELCD_START .. IOPT .. CONS .. SH .. AOPTA .. HAMZA .. "$") and dotrack("aqillaa") -- ʾaqillāʾ, ʾajillāʾ "important (pl.)", ʾaḥibbāʾ "lovers"
) then
return "di"
elseif num == "sg" and ( -- diptote singular patterns (nouns/adjectives)
rfind(stem, "^" .. CONS .. A .. CONS .. SK .. CONS .. AOPTA .. HAMZA .. "$") and dotrack("qamraa") or -- qamrāʾ "moon-white, moonlight"; baydāʾ "desert"; ṣaḥrāʾ "desert-like, desert"; tayhāʾ "trackless, desolate region"; not pl. to avoid catching e.g. ʾabnāʾ "sons", ʾaḥmāʾ "fathers-in-law", ʾamlāʾ "steppes, deserts" (pl. of malan), ʾanbāʾ "reports" (pl. of nabaʾ)
rfind(stem, ELCD_START .. SK .. CONS .. A .. CONS .. "$") and dotrack("abyad") or -- ʾabyaḍ "white", ʾakbar "greater"; FIXME nouns like ʾaʿzab "bachelor", ʾaḥmad "Ahmed" but not ʾarnab "rabbit", ʾanjar "anchor", ʾabjad "abjad", ʾarbaʿ "four", ʾandar "threshing floor" (cf. diptote ʾandar "rarer")
rfind(stem, ELCD_START .. A .. CONS .. SH .. "$") and dotrack("alaff") or -- ʾalaff "plump", ʾaḥabb "more desirable"
-- do the following on the origstem so we can check specifically for alif madda
rfind(origstem, "^" .. AMAD .. CONS .. A .. CONS .. "$") and dotrack("aalam") -- ʾālam "more painful", ʾāḵar "other"
) then
return "di"
-- NOTE(review): this branch compares POS with the English string
-- "adjective", while the entry points in this file pass localized
-- part-of-speech strings (e.g. "tính từ") as data.pos. Whether those strings
-- reach this function unchanged depends on get_pos() and the callers, which
-- are not visible here -- confirm that the branch can still match for base
-- adjectives.
elseif num == "sg" and pos == "adjective" and ( -- diptote singular patterns (adjectives)
rfind(stem, "^" .. CONS .. A .. CONS .. SK .. CONS .. AOPTA .. N .. "$") and dotrack("kaslaan") or -- kaslān "lazy", ʿaṭšān "thirsty", jawʿān "hungry", ḡaḍbān "angry", tayhān "wandering, perplexed"; but not nouns like qaṭrān "tar", šayṭān "devil", mawtān "plague", maydān "square"
-- rfind(stem, "^" .. CONS .. A .. CONS .. SH .. AOPTA .. N .. "$") and dotrack("laffaa") -- excluded because of too many false positives e.g. ḵawwān "disloyal", not to mention nouns like jannān "gardener"; only diptote example I can find is ʿayyān "incapable, weary" (diptote per Lane but not Wehr)
rfind(stem, "^" .. CONS .. A .. CONS .. SH .. AOPTA .. HAMZA .. "$") and dotrack("laffaa") -- laffāʾ "plump (fem.)"; but not nouns like jarrāʾ "runner", ḥaddāʾ "camel driver", lawwāʾ "wryneck"
) then
return "di"
elseif rfind(stem, AMAQ .. "$") then -- kaslā, ḏikrā (spelled with alif maqṣūra)
return "inv"
elseif rfind(stem, "[" .. ALIF .. SK .. "]" .. Y .. AOPTA .. "$") then -- dunyā, hadāyā (spelled with tall alif after yāʾ)
return "inv"
elseif rfind(stem, ALIF .. "$") then -- kāmērā, lībiyā (spelled with tall alif; we catch dunyā and hadāyā above)
return "lwinv"
elseif rfind(stem, II .. "$") then -- cases like كُوبْرِي kubrī "bridge" and صَوَانِي ṣawānī pl. of ṣīniyya; modern words that would probably end with -in
dotrack("ii")
return "inv"
elseif rfind(stem, UU .. "$") then -- FIXME: Does this occur? Check the tracking
dotrack("uu")
return "inv"
else
-- Nothing more specific matched: default to a regular triptote.
return "tri"
end
end
-- Canonicalize a word-final hamza. Three substitutions are applied in
-- order: a final alif madda becomes HAMZA_PH + fatḥa + alif; a final hamza
-- of any sort, optionally followed by ḍammatān, ḍamma or kasratān, becomes
-- HAMZA_PH (keeping the vowel mark); and a hamza of any sort followed by
-- fatḥatān + alif or alif maqṣūra becomes HAMZA_PH (keeping what follows).
-- Use hamza_seat() to undo these changes.
function canon_hamza(word)
    local substitutions = {
        {AMAD .. "$", HAMZA_PH .. AA},
        {HAMZA_ANY .. "([" .. UN .. U .. IN .. "]?)$", HAMZA_PH .. "%1"},
        {HAMZA_ANY .. "(" .. AN .. "[" .. ALIF .. AMAQ .. "])$", HAMZA_PH .. "%1"},
    }
    for _, substitution in ipairs(substitutions) do
        word = rsub(word, substitution[1], substitution[2])
    end
    return word
end
-- Resolve the hamza placeholder (HAMZA_PH) in WORD into its proper seat(s).
-- A word without a placeholder is returned unchanged, wrapped in a
-- one-element array, which avoids running the substitutions in
-- ar_utilities.process_hamza() needlessly. Otherwise whatever
-- process_hamza() returns is passed straight through.
function hamza_seat(word)
	if not rfind(word, HAMZA_PH) then
		return {word}
	end
	return ar_utilities.process_hamza(word)
end
--[[
-- Supply the appropriate hamza seat for a placeholder hamza in a combined
-- Arabic/transliteration expression (ARABIC/TR).
function split_and_hamza_seat(word)
if rfind(word, HAMZA_PH) then -- optimization to avoid many regexp substs
local ar, tr = split_arabic_tr(word)
-- FIXME: Do something with all values returned
ar = ar_utilities.process_hamza(ar)[1]
return ar .. "/" .. tr
end
return word
end
--]]
-- Return stem and type of an argument given the singular stem and whether
-- this is a plural argument. WORD may be of the form ARABIC, ARABIC/TR,
-- ARABIC:TYPE, ARABIC/TR:TYPE, or TYPE, for Arabic stem ARABIC with
-- transliteration TR and of type (i.e. declension) TYPE. If the type
-- is omitted, it is auto-detected using detect_type(). If the transliteration
-- is omitted, it is auto-transliterated from the Arabic. If only the type
-- is present, it is one of the inference keywords handled below ("f", "rf",
-- "elf", "cdf", "intf", "m", "rm", "d", "p", "sp", "fp", "paucp", "sfp",
-- "smp", "cdp", "awnp" or "?"), in which case the stem and translit are
-- generated from the singular by regular rules. SG may be of the form
-- ARABIC/TR or ARABIC. SGTYPE is the type of the singular; it is compared
-- below against values such as "cd", "el", "di", "an", "inv", "lwinv" and
-- "?". ISFEM is true if WORD is a feminine stem. NUM is either "sg", "du"
-- or "pl" according to the number of the stem.
--
-- Two values are returned: the stem in ARABIC/TR format (with shaddas
-- reordered in the Arabic) and the type. When the form cannot be inferred,
-- the pair "", "?" is returned.
--
-- POS is the part of speech, generally "noun" or "adjective". Used to
-- distinguish nouns and adjectives of the فَعْلَان type. There are nouns of
-- this type and they generally are triptotes, e.g. قَطْرَان "tar"
-- and شَيْطَان "devil". An additional complication is that the user can set
-- the POS to something else, like "numeral". We don't use this POS for
-- modifiers, where we determine whether they are noun-like or adjective-like
-- according to whether mod_idafa= is true.
function export.stem_and_type(word, sg, sgtype, isfem, num, pos)
local rettype = nil
-- Split off an explicit ":TYPE" suffix, if any.
if rfind(word, ":") then
local split = rsplit(word, ":")
if #split > 2 then
error("More than one colon found in argument: '" .. word .. "'")
end
word, rettype = split[1], split[2]
end
local ar, tr = split_arabic_tr(word)
-- Need to reorder shaddas here so that shadda at the end of a stem
-- followed by ʾiʿrāb or a plural ending or whatever can get processed
-- correctly. This processing happens in various places so make sure
-- we return the reordered Arabic in all circumstances.
ar = reorder_shadda(ar)
local artr = ar .. "/" .. tr
-- Now return split-out ARABIC/TR and TYPE, with shaddas reordered in
-- the Arabic.
if rettype then
return artr, rettype
end
-- Likewise, do shadda reordering for the singular.
local sgar, sgtr = split_arabic_tr(sg)
sgar = reorder_shadda(sgar)
-- Apply a substitution to the singular Arabic and translit. If a
-- substitution could be made, return the combined ARABIC/TR with
-- substitutions made; else, return nil. The Arabic has ARFROM
-- replaced with ARTO, while the translit has TRFROM replaced with
-- TRTO; if that doesn't match, TRFROM2 is replaced with TRTO2, and
-- failing that TRFROM3 with TRTO3 (the last two pairs are optional).
-- If no translit pattern matches, an error is raised unless the
-- translit contains BOGUS_CHAR, in which case it is left unchanged.
-- Note that this closure reads the current values of SGAR and SGTR, so
-- reassignments of SGAR below (e.g. via canon_hamza()) affect it.
local function sub(arfrom, arto, trfrom, trto, trfrom2, trto2, trfrom3, trto3)
if rfind(sgar, arfrom) then
local arret = rsub(sgar, arfrom, arto)
local trret = sgtr
if rfind(sgtr, trfrom) then
trret = rsub(sgtr, trfrom, trto)
elseif trfrom2 and rfind(sgtr, trfrom2) then
trret = rsub(sgtr, trfrom2, trto2)
elseif trfrom3 and rfind(sgtr, trfrom3) then
trret = rsub(sgtr, trfrom3, trto3)
elseif not rfind(sgtr, BOGUS_CHAR) then
error("Transliteration '" .. sgtr .."' does not have same ending as Arabic '" .. sgar .. "'")
end
return arret .. "/" .. trret
else
return nil
end
end
-- Sanity checks: each inference keyword is only valid for a particular
-- number (and, for the feminine keywords, only when ISFEM is set).
if (num ~= "sg" or not isfem) and (word == "elf" or word == "cdf" or word == "intf" or word == "rf" or word == "f") then
error("Inference of form for inflection type '" .. word .. "' only allowed in singular feminine")
end
if num ~= "du" and word == "d" then
error("Inference of form for inflection type '" .. word .. "' only allowed in dual")
end
if num ~= "pl" and (word == "sfp" or word == "smp" or word == "awnp" or word == "cdp" or word == "sp" or word == "fp" or word == "p") then
error("Inference of form for inflection type '" .. word .. "' only allowed in plural")
end
-- True if AR has one of the shapes CaCCān, CaCʾān (with alif madda) or
-- CaCCān with a geminate middle, optionally followed by -u.
local function is_intensive_adj(ar)
return rfind(ar, "^" .. CONS .. A .. CONS .. SK .. CONS .. AOPTA .. N .. UOPT .. "$") or
rfind(ar, "^" .. CONS .. A .. CONS .. SK .. AMAD .. N .. UOPT .. "$") or
rfind(ar, "^" .. CONS .. A .. CONS .. SH .. AOPTA .. N .. UOPT .. "$")
end
-- True if POS is "adjective" and AR has the shape of a feminine
-- color/defect adjective (CaCCāʾ or CaCCāʾ with geminate), optionally
-- followed by -u.
local function is_feminine_cd_adj(ar)
return pos == "adjective" and
(rfind(ar, "^" .. CONS .. A .. CONS .. SK .. CONS .. AOPTA .. HAMZA .. UOPT .. "$") or -- ḥamrāʾ/ʿamyāʾ/bayḍāʾ
rfind(ar, "^" .. CONS .. A .. CONS .. SH .. AOPTA .. HAMZA .. UOPT .. "$") -- laffāʾ
)
end
-- True if AR has the shape of a masculine elative or color/defect
-- adjective (the two cannot be told apart by shape alone).
local function is_elcd_adj(ar)
return rfind(ar, ELCD_START .. SK .. CONS .. A .. CONS .. UOPT .. "$") or -- ʾabyaḍ "white", ʾakbar "greater"
rfind(ar, ELCD_START .. A .. CONS .. SH .. UOPT .. "$") or -- ʾalaff "plump", ʾaqall "fewer"
rfind(ar, ELCD_START .. SK .. CONS .. AAMAQ .. "$") or -- ʾaʿmā "blind", ʾadnā "lower"
rfind(ar, "^" .. AMAD .. CONS .. A .. CONS .. UOPT .. "$") -- ʾālam "more painful", ʾāḵar "other"
end
if word == "?" or
(rfind(word, "^[a-z][a-z]*$") and sgtype == "?") then
--if 'word' is a type, actual value inferred from sg; if sgtype is ?,
--propagate it to all derived types
return "", "?"
end
-- Intensive feminine: replace final -ān with alif maqṣūra (or keep just
-- the alif madda when the stem ends in -ʾān).
if word == "intf" then
if not is_intensive_adj(sgar) then
error("Singular stem not in CACCān form: " .. sgar)
end
local ret = (
sub(AMAD .. N .. UOPT .. "$", AMAD, "nu?$", "") or -- ends in -ʾān
sub(AOPTA .. N .. UOPT .. "$", AMAQ, "nu?$", "") -- ends in -ān
)
return ret, "inv"
end
-- Elative feminine; the first matching masculine shape wins.
if word == "elf" then
local ret = (
sub(ELCD_START .. SK .. "[" .. Y .. W .. "]" .. A .. CONSPAR .. UOPT .. "$",
"%1" .. UU .. "%2" .. AMAQ, "ʔa(.)[yw]a(.)u?", "%1ū%2ā") or -- ʾajyad
sub(ELCD_START .. SK .. CONSPAR .. A .. CONSPAR .. UOPT .. "$",
"%1" .. U .. "%2" .. SK .. "%3" .. AMAQ, "ʔa(.)(.)a(.)u?", "%1u%2%3ā") or -- ʾakbar
sub(ELCD_START .. A .. CONSPAR .. SH .. UOPT .. "$",
"%1" .. U .. "%2" .. SH .. AMAQ, "ʔa(.)a(.)%2u?", "%1u%2%2ā") or -- ʾaqall
sub(ELCD_START .. SK .. CONSPAR .. AAMAQ .. "$",
"%1" .. U .. "%2" .. SK .. Y .. ALIF, "ʔa(.)(.)ā", "%1u%2yā") or -- ʾadnā
sub("^" .. AMAD .. CONSPAR .. A .. CONSPAR .. UOPT .. "$",
HAMZA_ON_ALIF .. U .. "%1" .. SK .. "%2" .. AMAQ, "ʔā(.)a(.)u?", "ʔu%1%2ā") -- ʾālam "more painful", ʾāḵar "other"
)
if not ret then
error("Singular stem not an elative adjective: " .. sgar)
end
return ret, "inv"
end
-- Color/defect feminine; the first matching masculine shape wins.
if word == "cdf" then
local ret = (
sub(ELCD_START .. SK .. CONSPAR .. A .. CONSPAR .. UOPT .. "$",
"%1" .. A .. "%2" .. SK .. "%3" .. AA .. HAMZA, "ʔa(.)(.)a(.)u?", "%1a%2%3āʔ") or -- ʾaḥmar
sub(ELCD_START .. A .. CONSPAR .. SH .. UOPT .. "$",
"%1" .. A .. "%2" .. SH .. AA .. HAMZA, "ʔa(.)a(.)%2u?", "%1a%2%2āʔ") or -- ʾalaff
sub(ELCD_START .. SK .. CONSPAR .. AAMAQ .. "$",
"%1" .. A .. "%2" .. SK .. Y .. AA .. HAMZA, "ʔa(.)(.)ā", "%1a%2yāʔ") -- ʾaʿmā
)
if not ret then
error("Singular stem not a color/defect adjective: " .. sgar)
end
return ret, "cd" -- so plural will be correct
end
-- Regular feminine -- add ة, possibly with stem modifications
if word == "rf" then
sgar = canon_hamza(sgar)
if rfind(sgar, TAM .. UNUOPT .. "$") then
--Don't do this or we have problems when forming singulative from
--collective with a construct modifier that's feminine
--error("Singular stem is already feminine: " .. sgar)
return sgar .. "/" .. sgtr, "tri"
end
local ret = (
sub(AN .. "[" .. ALIF .. AMAQ .. "]$", AAH, "an$", "āh") or -- ends in -an
sub(IN .. "$", IY .. AH, "in$", "iya") or -- ends in -in
sub(AOPT .. "[" .. ALIF .. AMAQ .. "]$", AAH, "ā$", "āh") or -- ends in alif or alif maqṣūra
-- We separate the ʾiʿrāb and no-ʾiʿrāb cases even though we can
-- do a single Arabic regexp to cover both because we want to
-- remove u(n) from the translit only when ʾiʿrāb is present to
-- lessen the risk of removing -un in the actual stem. We also
-- allow for cases where the ʾiʿrāb is present in Arabic but not
-- in translit.
sub(UNU .. "$", AH, "un?$", "a", "$", "a") or -- anything else + -u(n)
sub("$", AH, "$", "a") -- anything else
)
return ret, "tri"
end
-- Generic feminine: dispatch on the singular type (and shape) to one of
-- the specific feminine keywords above by calling ourselves recursively.
if word == "f" then
if sgtype == "cd" then
return export.stem_and_type("cdf", sg, sgtype, true, "sg", pos)
elseif sgtype == "el" then
return export.stem_and_type("elf", sg, sgtype, true, "sg", pos)
elseif sgtype =="di" and is_intensive_adj(sgar) then
return export.stem_and_type("intf", sg, sgtype, true, "sg", pos)
elseif sgtype == "di" and is_elcd_adj(sgar) then
-- If form is elative or color-defect, we don't know which of
-- the two it is, and each has a special feminine which isn't
-- the regular "just add ة", so shunt to unknown. This will
-- ensure that ?'s appear in place of the inflection -- also
-- for dual and plural.
return export.stem_and_type("?", sg, sgtype, true, "sg", pos)
else
return export.stem_and_type("rf", sg, sgtype, true, "sg", pos)
end
end
-- Regular masculine -- the inverse of "rf": strip the feminine ending.
if word == "rm" then
sgar = canon_hamza(sgar)
--Don't do this or we have problems when forming collective from
--singulative with a construct modifier that's not feminine,
--e.g. شَجَرَة التُفَّاح
--if not rfind(sgar, TAM .. UNUOPT .. "$") then
-- error("Singular stem is not feminine: " .. sgar)
--end
local ret = (
sub(AAH .. UNUOPT .. "$", AN .. AMAQ, "ātun?$", "an", "ā[ht]$", "an") or -- in -āh
sub(IY .. AH .. UNUOPT .. "$", IN, "iyatun?$", "in", "iya$", "in") or -- ends in -iya
sub(AOPT .. TAM .. UNUOPT .. "$", "", "atun?$", "", "a$", "") or --ends in -a
sub("$", "", "$", "") -- do nothing
)
return ret, "tri"
end
-- Generic masculine: currently always the regular masculine.
if word == "m" then
-- FIXME: handle cd (color-defect)
-- FIXME: handle el (elative)
-- FIXME: handle int (intensive)
return export.stem_and_type("rm", sg, sgtype, false, "sg", pos)
end
-- The plural used for feminine adjectives. If the singular type is
-- color/defect or it looks like a feminine color/defect adjective,
-- use color/defect plural. Otherwise shunt to sound feminine plural.
if word == "fp" then
if sgtype == "cd" or is_feminine_cd_adj(sgar) then
return export.stem_and_type("cdp", sg, sgtype, true, "pl", pos)
else
return export.stem_and_type("sfp", sg, sgtype, true, "pl", pos)
end
end
-- Sound plural: color/defect plural for type "cd", else sound feminine
-- plural if ISFEM, else -awn plural for type "an", else sound masculine
-- plural.
if word == "sp" then
if sgtype == "cd" then
return export.stem_and_type("cdp", sg, sgtype, isfem, "pl", pos)
elseif isfem then
return export.stem_and_type("sfp", sg, sgtype, true, "pl", pos)
elseif sgtype == "an" then
return export.stem_and_type("awnp", sg, sgtype, false, "pl", pos)
else
return export.stem_and_type("smp", sg, sgtype, false, "pl", pos)
end
end
-- Conservative plural, as used for masculine plural adjectives.
-- If singular type is color-defect, shunt to color-defect plural; else
-- shunt to unknown, so ? appears in place of the inflections.
if word == "p" then
if sgtype == "cd" then
return export.stem_and_type("cdp", sg, sgtype, isfem, "pl", pos)
else
return export.stem_and_type("?", sg, sgtype, isfem, "pl", pos)
end
end
-- Special plural used for paucal plurals of singulatives. If ends in -ة
-- (most common), use strong feminine plural; if ends with -iyy (next
-- most common), use strong masculine plural; else default to "p"
-- (conservative plural).
if word == "paucp" then
if rfind(sgar, TAM .. UNUOPT .. "$") then
return export.stem_and_type("sfp", sg, sgtype, true, "pl", pos)
elseif rfind(sgar, IY .. SH .. UNUOPT .. "$") then
return export.stem_and_type("smp", sg, sgtype, false, "pl", pos)
else
return export.stem_and_type("p", sg, sgtype, isfem, "pl", pos)
end
end
-- Dual in -ān, possibly with stem modifications. The alternatives are
-- tried in order and the first one whose Arabic pattern matches wins,
-- so the more specific endings must stay ahead of the catch-all cases.
if word == "d" then
sgar = canon_hamza(sgar)
local ret = (
sub(AN .. "[" .. ALIF .. AMAQ .. "]$", AY .. AAN, "an$", "ayān") or -- ends in -an
sub(IN .. "$", IY .. AAN, "in$", "iyān") or -- ends in -in
sgtype == "lwinv" and sub(AOPTA .. "$", AT .. AAN, "[āa]$", "atān") or -- lwinv, ends in alif; allow translit with short -a
sub(AOPT .. "[" .. ALIF .. AMAQ .. "]$", AY .. AAN, "ā$", "ayān") or -- ends in alif or alif maqṣūra
-- We separate the ʾiʿrāb and no-ʾiʿrāb cases even though we can
-- do a single Arabic regexp to cover both because we want to
-- remove u(n) from the translit only when ʾiʿrāb is present to
-- lessen the risk of removing -un in the actual stem. We also
-- allow for cases where the ʾiʿrāb is present in Arabic but not
-- in translit.
--
-- NOTE: Collapsing the "h$" and "$" cases into "h?$" doesn't work
-- in the case of words ending in -āh, which end up having the
-- translit end in -tāntān.
sub(TAM .. UNU .. "$", T .. AAN, "[ht]un?$", "tān", "h$", "tān", "$", "tān") or -- ends in tāʾ marbuṭa + -u(n)
sub(TAM .. "$", T .. AAN, "h$", "tān", "$", "tān") or -- ends in tāʾ marbuṭa
-- Same here as above
sub(UNU .. "$", AAN, "un?$", "ān", "$", "ān") or -- anything else + -u(n)
sub("$", AAN, "$", "ān") -- anything else
)
return ret, "d"
end
-- Strong feminine plural in -āt, possibly with stem modifications
if word == "sfp" then
sgar = canon_hamza(sgar)
-- canon_hamza() only handles word-final hamza, so also canonicalize an
-- alif madda or hamza that directly precedes a final tāʾ marbūṭa.
sgar = rsub(sgar, AMAD .. "(" .. TAM .. UNUOPT .. ")$", HAMZA_PH .. AA .. "%1")
sgar = rsub(sgar, HAMZA_ANY .. "(" .. AOPT .. TAM .. UNUOPT .. ")$", HAMZA_PH .. "%1")
local ret = (
sub(AOPTA .. TAM .. UNUOPT .. "$", AYAAT, "ā[ht]$", "ayāt", "ātun?$", "ayāt") or -- ends in -āh
sub(AOPT .. TAM .. UNUOPT .. "$", AAT, "a$", "āt", "atun?$", "āt") or -- ends in -a
sub(AN .. "[" .. ALIF .. AMAQ .. "]$", AYAAT, "an$", "ayāt") or -- ends in -an
sub(IN .. "$", IY .. AAT, "in$", "iyāt") or -- ends in -in
sgtype == "inv" and (
sub(AOPT .. "[" .. ALIF .. AMAQ .. "]$", AYAAT, "ā$", "ayāt") -- ends in alif or alif maqṣūra
) or
sgtype == "lwinv" and (
sub(AOPTA .. "$", AAT, "[āa]$", "āt") -- loanword ending in tall alif; allow translit with short -a
) or
-- We separate the ʾiʿrāb and no-ʾiʿrāb cases even though we can
-- do a single Arabic regexp to cover both because we want to
-- remove u(n) from the translit only when ʾiʿrāb is present to
-- lessen the risk of removing -un in the actual stem. We also
-- allow for cases where the ʾiʿrāb is present in Arabic but not
-- in translit.
sub(UNU .. "$", AAT, "un?$", "āt", "$", "āt") or -- anything else + -u(n)
sub("$", AAT, "$", "āt") -- anything else
)
return ret, "sfp"
end
-- Strong masculine plural in -ūn
if word == "smp" then
sgar = canon_hamza(sgar)
local ret = (
sub(IN .. "$", UUN, "in$", "ūn") or -- ends in -in
-- See comments above for why we have two cases, one for UNU and
-- one for non-UNU
sub(UNU .. "$", UUN, "un?$", "ūn", "$", "ūn") or -- anything else + -u(n)
sub("$", UUN, "$", "ūn") -- anything else
)
return ret, "smp"
end
-- Color/defect plural; singular must be masculine or feminine
-- color/defect adjective
if word == "cdp" then
local ret = (
sub(ELCD_START .. SK .. W .. A .. CONSPAR .. UOPT .. "$",
"%1" .. UU .. "%2", "ʔa(.)wa(.)u?", "%1ū%2") or -- ʾaswad
sub(ELCD_START .. SK .. Y .. A .. CONSPAR .. UOPT .. "$",
"%1" .. II .. "%2", "ʔa(.)ya(.)u?", "%1ī%2") or -- ʾabyaḍ
sub(ELCD_START .. SK .. CONSPAR .. A .. CONSPAR .. UOPT .. "$",
"%1" .. U .. "%2" .. SK .. "%3", "ʔa(.)(.)a(.)u?", "%1u%2%3") or -- ʾaḥmar
sub(ELCD_START .. A .. CONSPAR .. SH .. UOPT .. "$",
"%1" .. U .. "%2" .. SH, "ʔa(.)a(.)%2u?", "%1u%2%2") or -- ʾalaff
sub(ELCD_START .. SK .. CONSPAR .. AAMAQ .. "$",
"%1" .. U .. "%2" .. Y, "ʔa(.)(.)ā", "%1u%2y") or -- ʾaʿmā
sub("^" .. CONSPAR .. A .. W .. SKOPT .. CONSPAR .. AA .. HAMZA .. UOPT .. "$", "%1" .. UU .. "%2", "(.)aw(.)āʔu?", "%1ū%2") or -- sawdāʾ
sub("^" .. CONSPAR .. A .. Y .. SKOPT .. CONSPAR .. AA .. HAMZA .. UOPT .. "$", "%1" .. II .. "%2", "(.)ay(.)āʔu?", "%1ī%2") or -- bayḍāʾ
sub("^" .. CONSPAR .. A .. CONSPAR .. SK .. CONSPAR .. AA .. HAMZA .. UOPT .. "$", "%1" .. U .. "%2" .. SK .. "%3", "(.)a(.)(.)āʔu?", "%1u%2%3") or -- ḥamrāʾ/ʿamyāʾ
sub("^" .. CONSPAR .. A .. CONSPAR .. SH .. AA .. HAMZA .. UOPT .. "$", "%1" .. U .. "%2" .. SH, "(.)a(.)%2āʔu?", "%1u%2%2") -- laffāʾ
)
if not ret then
error("For 'cdp', singular must be masculine or feminine color/defect adjective: " .. sgar)
end
return ret, "tri"
end
-- Plural in -awn; singular must end in -an
if word == "awnp" then
local ret = (
sub(AN .. "[" .. ALIF .. AMAQ .. "]$", AWSK .. N, "an$", "awn") -- ends in -an
)
if not ret then
error("For 'awnp', singular must end in -an: " .. sgar)
end
return ret, "awnp"
end
-- WORD was not an inference keyword: treat it as an explicit stem and
-- auto-detect its type.
return artr, export.detect_type(ar, isfem, num, pos)
end
-- Separators used by show_form_1() and make_table() when joining several
-- values into one table cell: OUTERSEP goes between the top-level
-- alternatives of a form (e.g. different possible plurals) and INNERSEP
-- between the variants of a single alternative (e.g. hamza spellings).
-- LRM is defined elsewhere in this file (presumably the Unicode
-- left-to-right mark -- confirm at its definition).
--
-- Earlier HTML-based separator, kept for reference:
-- local outersep = " <small style=\"color: #888\">or</small> "
-- need LRM here so multiple Arabic plurals end up agreeing in order with
-- the transliteration
local outersep = LRM .. "; "
local innersep = LRM .. "/"
-- Subfunction of show_form(), used to implement recursively generating
-- all combinations of elements from FORM and from each of the items in
-- LIST_OF_MODS, both of which are either arrays of strings or arrays of
-- arrays of strings, where the strings are in the form ARABIC/TRANSLIT,
-- as described in show_form(). TRAILING_ARTRMODS is an array of ARTRMOD
-- items, each of which is a two-element array of ARMOD (Arabic) and TRMOD
-- (transliteration), accumulating all of the suffixes generated so far
-- in the recursion process. Each time we recur we take the last MOD item
-- of LIST_OF_MODS, separate each element in MOD into its Arabic and
-- Latin parts and to each Arabic/Latin pair we add each element in
-- TRAILING_ARTRMODS in turn, passing the newly generated list of ARTRMOD
-- items down the next recursion level with the shorter LIST_OF_MODS. We end
-- up returning a string to insert into the Wiki-markup table.
--
-- USE_PARENS selects the output layout (see below). LIST_OF_MODS is not
-- modified; a shortened copy is passed to the recursive call.
function show_form_1(form, list_of_mods, trailing_artrmods, use_parens)
	if #list_of_mods == 0 then
		local arabicvals = {}
		local latinvals = {}
		local parenvals = {}
		-- Accumulate separately the Arabic and transliteration into
		-- ARABICVALS and LATINVALS, then concatenate each down below.
		-- However, if USE_PARENS, we put each transliteration directly
		-- after the corresponding Arabic, in parens, and put the results
		-- in PARENVALS, which get concatenated below. (This is used in the
		-- title of the declension table.)
		for _, artrmod in ipairs(trailing_artrmods) do
			assert(#artrmod == 2)
			local armod = artrmod[1]
			local trmod = artrmod[2]
			for _, subform in ipairs(form) do
				local ar_subspans = {}
				local tr_subspans = {}
				if type(subform) ~= "table" then
					subform = {subform}
				end
				for _, subsubform in ipairs(subform) do
					local ar_subspan, tr_subspan
					local arabic, translit = split_arabic_tr(subsubform)
					if arabic == "-" then
						ar_subspan = "—"
						tr_subspan = "—"
					else
						-- A bogus translit anywhere makes the whole thing "?"
						tr_subspan = (rfind(translit, BOGUS_CHAR) or rfind(trmod, BOGUS_CHAR)) and "?" or
							require("Module:script utilities").tag_translit(translit .. trmod, lang, "default", 'style="color: #888;"')
						-- implement elision of al- after vowel
						tr_subspan = rsub(tr_subspan, "([aeiouāēīōū][ %-])a([sšṣtṯṭdḏḍzžẓnrḷl]%-)", "%1%2")
						tr_subspan = rsub(tr_subspan, "([aeiouāēīōū][ %-])a(llāh)", "%1%2")
						ar_subspan = m_links.full_link({lang = lang, term = arabic .. armod, tr = "-"})
					end
					insert_if_not(ar_subspans, ar_subspan)
					insert_if_not(tr_subspans, tr_subspan)
				end
				local ar_span = table.concat(ar_subspans, innersep)
				local tr_span = table.concat(tr_subspans, innersep)
				if use_parens then
					table.insert(parenvals, ar_span .. " (" .. tr_span .. ")")
				else
					table.insert(arabicvals, ar_span)
					table.insert(latinvals, tr_span)
				end
			end
		end
		if use_parens then
			return table.concat(parenvals, outersep)
		else
			local arabic_span = table.concat(arabicvals, outersep)
			local latin_span = table.concat(latinvals, outersep)
			return arabic_span .. "<br />" .. latin_span
		end
	else
		-- Peel off the last modifier without mutating the caller's list.
		local nmods = #list_of_mods
		local last_mods = list_of_mods[nmods]
		local remaining_mods = {}
		for i = 1, nmods - 1 do
			remaining_mods[i] = list_of_mods[i]
		end
		local artrmods = {}
		for _, mod in ipairs(last_mods) do
			if type(mod) ~= "table" then
				mod = {mod}
			end
			for _, submod in ipairs(mod) do
				local armod, trmod = split_arabic_tr(submod)
				-- If the value is -, we need to create a blank entry
				-- rather than skipping it; if we have no entries at any
				-- level, then there will be no overall entries at all
				-- because the inside of the loop at the next level will
				-- never be executed.
				if armod == "-" then
					armod = ""
					trmod = ""
				end
				if armod ~= "" then armod = ' ' .. armod end
				if trmod ~= "" then trmod = ' ' .. trmod end
				-- Pair this modifier with each trailing suffix separately.
				-- ARMOD/TRMOD must not be overwritten here, otherwise every
				-- later combination would also contain all of the earlier
				-- trailing suffixes.
				for _, trailing_artrmod in ipairs(trailing_artrmods) do
					table.insert(artrmods, {
						armod .. trailing_artrmod[1],
						trmod .. trailing_artrmod[2],
					})
				end
			end
		end
		return show_form_1(form, remaining_mods, artrmods, use_parens)
	end
end
-- Produce the string that fills one cell of a Wiki-markup table.
--
-- FORM holds the inflected forms of the base: an array of strings (e.g.
-- the different possible plurals) or an array of arrays of strings (outer
-- level as before, inner level typically hamza-spelling variants).
-- LIST_OF_MODS has one MODS element per modifier; each MODS element holds
-- the inflected forms of that modifier, in the same shape as FORM. Each
-- string is typically "ARABIC/TRANSLIT". All combinations of one element
-- from each array are generated, which is done recursively by
-- show_form_1(). USE_PARENS is handed on to show_form_1() unchanged.
--
-- Returns an em dash when FORM is missing or empty, and raises an error
-- when FORM is present but is not a table.
function show_form(form, list_of_mods, use_parens)
	if form and type(form) ~= "table" then
		error("a non-table value was given in the list of inflected forms.")
	end
	if not form or #form == 0 then
		return "—"
	end
	-- Seed the recursion with a single blank suffix pair rather than an
	-- empty list; with no elements at all, nothing would be propagated to
	-- the next recursion level.
	local initial_suffixes = {{"", ""}}
	return show_form_1(form, list_of_mods, initial_suffixes, use_parens)
end
-- Render the Wiki-markup template WIKICODE using the values in DATA.
-- Every {{{PARAM}}} reference in WIKICODE is replaced by its value, and
-- the formatted categories from DATA.categories are appended.
function make_table(data, wikicode)
	-- States that are not among those to be displayed get all of their
	-- inflections emptied, so that they show up as a dash.
	for _, state in ipairs(data.allstates) do
		if not contains(data.states, state) then
			for _, numgen in ipairs(data.numgens()) do
				for _, case in ipairs(data.allcases) do
					data.forms[case .. "_" .. numgen .. "_" .. state] = {}
				end
			end
		end
	end

	-- Replacement callback for rsub(): map a parameter name to the text
	-- that replaces its {{{PARAM}}} reference.
	local function expand_param(param)
		if param == "pos" then
			return data.pos
		end
		if param == "info" then
			if data.title then
				return " (" .. data.title .. ")"
			end
			return ""
		end
		if rfind(param, "type$") then
			return table.concat(data.forms[param] or {"—"}, outersep)
		end
		-- Anything else is an inflected form: collect the corresponding
		-- forms of every modifier and combine them with the base forms.
		local list_of_mods = {}
		for _, mod in ipairs(mod_list) do
			local mods = data.forms[mod .. "_" .. param]
			if not mods or #mods == 0 then
				-- One blank element is needed rather than none, otherwise
				-- no elements would be propagated from one recursion
				-- level to the next.
				mods = {""}
			end
			table.insert(list_of_mods, mods)
		end
		return show_form(data.forms[param], list_of_mods, param == "lemma")
	end

	local rendered = rsub(wikicode, "{{{([a-z_]+)}}}", expand_param)
	return rendered .. m_utilities.format_categories(data.categories, lang)
end
-- Build the part of the noun table that belongs to the number spec NUM
-- (e.g. "sg"): three state header cells followed by one row per case, each
-- row holding the {{{CASE_NUM_STATE}}} placeholders for the three states.
-- Every line of the result, including the last, ends in a newline.
function generate_noun_num(num)
	local header_cell = '! style="background: #CDCDCD;" | '
	local label_cell = '! style="background: #EFEFEF;" | '
	local states = {"ind", "def", "con"}
	local case_rows = {
		{"inf", "Không chính thức"},
		{"nom", "[[nominative|nom.]]"},
		{"acc", "[[accusative|acc.]]"},
		{"gen", "[[genitive|gen.]]"},
	}
	local out = {
		header_cell .. "Bất định",
		header_cell .. "Xác định",
		header_cell .. "[[construct|cons.]]",
	}
	for _, case_row in ipairs(case_rows) do
		out[#out + 1] = "|-"
		out[#out + 1] = label_cell .. case_row[2]
		for _, state in ipairs(states) do
			out[#out + 1] = "| {{{" .. case_row[1] .. "_" .. num .. "_" .. state .. "}}}"
		end
	end
	return table.concat(out, "\n") .. "\n"
end
-- Assemble the Wiki-markup for a noun declension table covering every
-- number in DATA.numbers, then fill it in with make_table(). The dual gets
-- a single heading row; every other number gets its capitalized name (from
-- DATA.engnumberscap) plus a row holding its {{{NUM_type}}} placeholder.
function make_noun_table(data)
	local header_style = '! style="background: #CDCDCD;"'
	local chunks = {
		'<div class="NavFrame">\n',
		'<div class="NavHead">Biến cách của {{{pos}}} {{{lemma}}}</div>\n',
		'<div class="NavContent">\n',
		'{| class="inflection-table" style="border-width: 1px; border-collapse: collapse; background:#F9F9F9; text-align:center; width:100%;"\n',
	}
	for _, num in ipairs(data.numbers) do
		local heading
		if num == "du" then
			heading = "|-\n" .. header_style .. " | Số kép\n"
		else
			heading = "|-\n" .. header_style .. " rowspan=2 | " .. data.engnumberscap[num] .. "\n" ..
				header_style .. " colspan=3 | {{{" .. num .. "_type}}}\n|-\n"
		end
		chunks[#chunks + 1] = heading .. generate_noun_num(num)
	end
	chunks[#chunks + 1] = "|}\n</div>\n</div>"
	return make_table(data, table.concat(chunks))
end
-- Build the part of the gendered-noun table that belongs to the number
-- spec NUM (e.g. "sg"): a row of state header cells (three per gender)
-- followed by one row per case, each row holding the
-- {{{CASE_GENDER_NUM_STATE}}} placeholders, masculine first and then
-- feminine. Every line of the result, including the last, ends in a
-- newline.
function generate_gendered_noun_num(num)
	local header_cell = '! style="background: #CDCDCD;" | '
	local label_cell = '! style="background: #EFEFEF;" | '
	local genders = {"m", "f"}
	local states = {"ind", "def", "con"}
	local state_headers = {"Bất định", "Xác định", "[[construct|cons.]]"}
	local case_rows = {
		{"inf", "Không chính thức"},
		{"nom", "[[nominative|nom.]]"},
		{"acc", "[[accusative|acc.]]"},
		{"gen", "[[genitive|gen.]]"},
	}
	local out = {"|-"}
	-- The same state headers are repeated once per gender.
	for _ = 1, #genders do
		for _, header in ipairs(state_headers) do
			out[#out + 1] = header_cell .. header
		end
	end
	for _, case_row in ipairs(case_rows) do
		out[#out + 1] = "|-"
		out[#out + 1] = label_cell .. case_row[2]
		for _, gender in ipairs(genders) do
			for _, state in ipairs(states) do
				out[#out + 1] = "| {{{" .. case_row[1] .. "_" .. gender .. "_" .. num .. "_" .. state .. "}}}"
			end
		end
	end
	return table.concat(out, "\n") .. "\n"
end
-- Assemble the Wiki-markup for a gendered-noun declension table covering
-- every number in DATA.numbers, then fill it in with make_table(). Each
-- number gets masculine/feminine column headings; numbers other than the
-- dual use their capitalized name from DATA.engnumberscap and get an extra
-- row holding the {{{m_NUM_type}}} and {{{f_NUM_type}}} placeholders.
function make_gendered_noun_table(data)
	local header_style = '! style="background: #CDCDCD;"'
	local gender_headings = header_style .. " colspan=3 | Giống đực\n" ..
		header_style .. " colspan=3 | Giống cái\n"
	local chunks = {
		'<div class="NavFrame">\n',
		'<div class="NavHead">Biến cách của {{{pos}}} {{{lemma}}}</div>\n',
		'<div class="NavContent">\n',
		'{| class="inflection-table" style="border-width: 1px; border-collapse: collapse; background:#F9F9F9; text-align:center; width:100%;"\n',
	}
	for _, num in ipairs(data.numbers) do
		local heading
		if num == "du" then
			heading = "|-\n" .. header_style .. " rowspan=2 | Số kép\n" .. gender_headings
		else
			heading = "|-\n" .. header_style .. " rowspan=3 | " .. data.engnumberscap[num] .. "\n" ..
				gender_headings .. "|-\n" ..
				header_style .. " colspan=3 | {{{m_" .. num .. "_type}}}\n" ..
				header_style .. " colspan=3 | {{{f_" .. num .. "_type}}}\n"
		end
		chunks[#chunks + 1] = heading .. generate_gendered_noun_num(num)
	end
	chunks[#chunks + 1] = "|}\n</div>\n</div>"
	return make_table(data, table.concat(chunks))
end
-- Build the part of the adjective table that belongs to the number spec
-- NUM (e.g. "sg"): a row of state header cells (indefinite and definite
-- for each gender) followed by one row per case, each row holding the
-- {{{CASE_GENDER_NUM_STATE}}} placeholders, masculine first and then
-- feminine. Every line of the result, including the last, ends in a
-- newline.
function generate_adj_num(num)
	local header_cell = '! style="background: #CDCDCD;" | '
	local label_cell = '! style="background: #EFEFEF;" | '
	local genders = {"m", "f"}
	local states = {"ind", "def"}
	local state_headers = {"Bất định", "Xác định"}
	local case_rows = {
		{"inf", "Thông tục"},
		{"nom", "[[nominative|nom.]]"},
		{"acc", "[[accusative|acc.]]"},
		{"gen", "[[genitive|gen.]]"},
	}
	local out = {"|-"}
	-- The same state headers are repeated once per gender.
	for _ = 1, #genders do
		for _, header in ipairs(state_headers) do
			out[#out + 1] = header_cell .. header
		end
	end
	for _, case_row in ipairs(case_rows) do
		out[#out + 1] = "|-"
		out[#out + 1] = label_cell .. case_row[2]
		for _, gender in ipairs(genders) do
			for _, state in ipairs(states) do
				out[#out + 1] = "| {{{" .. case_row[1] .. "_" .. gender .. "_" .. num .. "_" .. state .. "}}}"
			end
		end
	end
	return table.concat(out, "\n") .. "\n"
end
-- Assemble the Wiki-markup for an adjective declension table, then fill it
-- in with make_table(). Singular, dual and plural sections are emitted in
-- that order, each only if DATA.numbers contains the number. Singular and
-- plural carry an extra row with the {{{m_NUM_type}}} and
-- {{{f_NUM_type}}} placeholders; the dual does not.
function make_adj_table(data)
	local header_style = '! style="background: #CDCDCD;"'
	local gender_headings = header_style .. " colspan=2 | Giống đực\n" ..
		header_style .. " colspan=2 | Giống cái\n"
	-- { number spec, heading label, whether a type row is shown }
	local sections = {
		{"sg", "Số ít", true},
		{"du", "Số kép", false},
		{"pl", "Số nhiều", true},
	}
	local chunks = {
		'<div class="NavFrame">\n',
		'<div class="NavHead">Biến cách của {{{pos}}} {{{lemma}}}</div>\n',
		'<div class="NavContent">\n',
		'{| class="inflection-table" style="border-width: 1px; border-collapse: collapse; background:#F9F9F9; text-align:center; width:100%;"\n',
	}
	for _, section in ipairs(sections) do
		local num, label, has_type_row = section[1], section[2], section[3]
		if contains(data.numbers, num) then
			local heading
			if has_type_row then
				heading = "|-\n" .. header_style .. " rowspan=3 | " .. label .. "\n" ..
					gender_headings .. "|-\n" ..
					header_style .. " colspan=2 | {{{m_" .. num .. "_type}}}\n" ..
					header_style .. " colspan=2 | {{{f_" .. num .. "_type}}}\n"
			else
				heading = "|-\n" .. header_style .. " rowspan=2 | " .. label .. "\n" ..
					gender_headings
			end
			chunks[#chunks + 1] = heading .. generate_adj_num(num)
		end
	end
	chunks[#chunks + 1] = "|}\n</div>\n</div>"
	return make_table(data, table.concat(chunks))
end
return export
-- For Vim, so we get 4-space tabs
-- vim: set ts=4 sw=4 noet: