-- Module:links
--
-- Documentation for this module may be created at Module:links/doc
local export = {}

--[=[
	[[Unsupported titles]], pages with high memory usage,
	extraction modules and part-of-speech names are listed
	at [[Module:links/data]].

	Other modules used:
		[[Module:script utilities]]
		[[Module:scripts]]
		[[Module:languages]] and its submodules
		[[Module:gender and number]]
		[[Module:debug]]
]=]

-- These are prefixed with u to avoid confusion with the default string methods
-- of the same name.
-- They are aliases of the Unicode-aware mw.ustring library functions.
local toNFC = mw.ustring.toNFC
local uchar = mw.ustring.char
local ufind = mw.ustring.find
local ugsub = mw.ustring.gsub
local usub = mw.ustring.sub

-- Entity decoder taken from Module:utilities; used on link targets before
-- they are split into page name and fragment.
local get_entities = require("Module:utilities").get_entities

-- Local aliases of the table library functions used when assembling output.
local table_insert = table.insert
local table_concat = table.concat

-- Lazily filled from Module:links/data the first time export.language_link runs.
local ignore_cap
-- Lazily filled from Module:links/data the first time
-- export.format_link_annotations formats a part of speech.
local pos_tags
-- Works out the page title that a term in the given language links to.
--   target: the term as written; it is first passed through lang:makeEntryName
--   lang:   language object (makeEntryName, getCode, getCanonicalName, getType)
--   sc:     script object, forwarded to lang:makeEntryName
-- Returns the page title, or nil when no link should be made.
-- Raises an error for an unstarred term in a "reconstructed" language.
function export.getLinkPage(target, lang, sc)
	-- The entry name is the term as normalised by the language object.
	local page = lang:makeEntryName(target, sc)

	-- Unexpanded template parameters can never form a valid link.
	if page:find("{{{") then
		return nil
	end

	local lead = page:sub(1, 1)

	-- A leading slash is returned with a colon in front and nothing else.
	if lead == "/" then
		return ":" .. page
	end

	-- Starred (reconstructed) terms live in the Reconstruction namespace.
	if lead == "*" and #page > 1 then
		if lang:getCode() == "und" then
			return nil
		end
		return "Reconstruction:" .. lang:getCanonicalName() .. "/" .. usub(page, 2)
	end

	local lang_type = lang:getType()

	if lang_type == "reconstructed" then
		error("The specified language " .. lang:getCanonicalName()
			.. " is unattested, while the given word is not marked with '*' to indicate that it is reconstructed")
	end

	-- Terms in appendix-only languages are linked under Appendix:.
	if lang_type == "appendix-constructed" then
		return "Appendix:" .. lang:getCanonicalName() .. "/" .. page
	end

	return page
end

-- Link prefixes whose targets never receive a language-section fragment.
-- For localized prefixes that make syntax error, please use the format: ["xyz"] = true,
local no_fragment_prefixes = {
	w = true,
	wikipedia = true,
	Category = true,
}

-- Make a language-specific link from given link's parts.
--   link:   table with `target`, optional `display`, and `fragment`
--           (nil = not looked for yet, false = parseLink found none);
--           the table is modified in place
--   lang:   language object
--   sc:     script object, forwarded to the language object's methods
--   id:     optional sense id; when given, the fragment is the sense anchor
--   allow_self_link: when true, a link to the current page is not turned
--           into a bold "self-link"
-- Returns wikitext: a piped [[link]], a <strong class="selflink"> element, or
-- just the display text when export.getLinkPage yields no target.
local function makeLangLink(link, lang, sc, id, allow_self_link)

	-- Find fragments (when link didn't come from parseLink).
	-- Prevents {{l|en|word#Etymology 2|word}} from linking to [[word#Etymology 2#English]].
	if link.fragment == nil then
		-- Replace numeric character references with the corresponding character (&#39; → '),
		-- as they contain #, which causes the numeric character reference to be
		-- misparsed (wa'a → wa&#39;a → pagename wa&, fragment 39;a).
		link.target = link.target:find("&#[^;]+;") and get_entities(link.target) or link.target
		local first, second = link.target:match("^([^#]+)#(.+)$")
		if first then
			link.target, link.fragment = first, second
		end
	end

	-- If there is no display form, then create a default one.
	if not link.display then
		link.display = link.target
	end

	-- Process the target
	link.target = export.getLinkPage(link.target, lang, sc)

	-- Process the display form.
	link.display = lang:makeDisplayText(link.display, sc)

	if not link.target then
		return link.display
	end

	-- If the target is the same as the current page and there is no sense id
	-- and linking to the same page hasn't been turned on, then return a "self-link"
	-- like the software does.
	-- gsub returns (string, count); keep only the string so that the count is
	-- not forwarded to get_entities as an accidental second argument.
	local decodedTarget = mw.uri.decode(link.target):gsub("^:", "")
	local selfCheckTarget = get_entities(decodedTarget)
	local selfCheckCurrentTitle = mw.title.getCurrentTitle().prefixedText
	if selfCheckTarget:find("[#%%&+/:<=>?@[\\%]_{|}]") then
		selfCheckTarget = lang:makeEntryName(selfCheckTarget)
		selfCheckCurrentTitle = lang:makeEntryName(selfCheckCurrentTitle)
	end
	if not (allow_self_link or id) and selfCheckTarget == selfCheckCurrentTitle then
		return "<strong class=\"selflink\">" .. link.display .. "</strong>"
	end

	-- Add fragment.
	-- Do not add a section link to "Undetermined", as such sections do not exist and are invalid.
	-- TabbedLanguages handles links without a section by linking to the "last visited" section,
	-- but adding "Undetermined" would break that feature.
	local prefix = link.target:match("^:?([^:]+):")

	if not no_fragment_prefixes[prefix] then
		if link.fragment or link.target:find("#$") then
			require("Module:debug/track") {
				"links/fragment",
				"links/fragment/" .. lang:getCode()
			}
		end

		if not link.fragment and lang:getCode() ~= "und" then
			if id then
				link.fragment = require("Module:senseid").anchor(lang, id)
			elseif not ufind(link.target, "^Appendix:")
					and not ufind(link.target, "^Reconstruction:") then
				link.fragment = lang:getCanonicalName()
			end
		end
	end

	return "[[" .. link.target .. (link.fragment and "#" .. link.fragment or "") .. "|" .. link.display .. "]]"
end


-- Split the inside of a [[...]] wikilink into its parts.
-- Returns a table with:
--   target   - the page part (text before "|", minus any "#fragment")
--   display  - the text after "|", or the whole unsplit target if there is no pipe
--   fragment - the text after the last "#", or false when there is none
local function parseLink(linktext)
	local pipe_pattern = "^([^|]+)|(.+)$"
	local target = linktext
	local before_pipe, after_pipe = target:match(pipe_pattern)

	-- Prevent characters whose HTML entities are unsupported titles from being
	-- incorrectly recognised as the entity if they are in a link being re-parsed
	-- (e.g. "&" becomes "&amp;" when returned, but "&amp;" is also an unsupported
	-- title. If "&" is given as a link which is then re-parsed, we don't want it
	-- to be perceived as "&amp;".)
	if target:match("&[^;]+;") then
		local unsupported_titles = mw.loadData("Module:links/data").unsupported_titles
		local listed = unsupported_titles[after_pipe]
		if listed and listed ~= before_pipe then
			target = get_entities(target)
			before_pipe, after_pipe = target:match(pipe_pattern)
		end
	end

	local display
	if before_pipe then
		target, display = before_pipe, after_pipe
	else
		display = target
	end

	-- Greedy match: the fragment is whatever follows the last "#".
	local page, fragment = target:match("^(.+)#(.+)$")

	return {
		target = page or target,
		display = display,
		-- false (rather than nil) so that makeLangLink does not look for a
		-- fragment again
		fragment = fragment or false,
	}
end


-- Creates a basic wikilink to the given term. If the text already contains
-- links, these are replaced with links to the correct section.
--   data: table with `term` (string, required), `lang` (language object),
--         and optional `alt`, `sc`, `id`
--   allow_self_link: forwarded to makeLangLink
-- Returns the wikitext of the link(s).
-- Raises an error if `data` is not a table or `data.term` is not a string.
function export.language_link(data, allow_self_link)
	if type(data) ~= "table" then
		error("The first argument to the function language_link must be a table. See Module:links/documentation for more information.")
	end

	local text = data.term

	-- Everything below calls string methods on the term, so fail with a clear
	-- message instead of an "attempt to index" error further down.
	if type(text) ~= "string" then
		error("The term passed to the function language_link must be a string, but its type was " .. type(text) .. ".")
	end

	ignore_cap = ignore_cap or mw.loadData("Module:links/data").ignore_cap
	if ignore_cap[data.lang:getCode()] then
		text = text:gsub("%^", "")
	end

	-- If the text begins with * and another character,
	-- then act as if each link begins with *
	local allReconstructed = text:find("^%*.") ~= nil

	-- Do we have embedded wikilinks?
	if text:find("[[", nil, true) then
		--[=[
		[[Special:WhatLinksHere/Template:tracking/links/alt-ignored]]
		[[Special:WhatLinksHere/Template:tracking/links/id-ignored]]
		]=]

		if data.alt then
			require("Module:debug/track")("links/alt-ignored")
			mw.log("(from Module:links)", "text with embedded wikilinks:", text,
				"ignored alt:", data.alt, "lang:", data.lang:getCode())
		end

		if data.id then
			require("Module:debug/track")("links/id-ignored")
			mw.log("(from Module:links)", "text with embedded wikilinks:", text,
				"ignored id:", data.id, "lang:", data.lang:getCode())
		end

		-- Begins and ends with a wikilink tag
		if text:find("^%[%[(.+)%]%]$") then
			-- There are no [ ] in between.
			-- This makes the wikilink tag redundant.
			if text:find("^%[%[[^%[%]]+%]%]$") then
				require("Module:debug/track")("links/redundant wikilink")
			else
				local temp = text:gsub("^%[%[(.+)%]%]$", "%1")
				temp = temp:gsub("%]%], %[%[", "|")

				if not temp:find("[%[%]]") then
					require("Module:debug/track")("links/list")
				end
			end
		end

		-- Rebuild every embedded link as a language-specific link.
		text = text:gsub("%[%[([^%]]+)%]%]",
			function(linktext)
				local link = parseLink(linktext)

				if allReconstructed then
					link.target = "*" .. link.target
				end

				return makeLangLink(link, data.lang, data.sc, data.id, allow_self_link)
			end)

		-- Remove the extra * at the beginning if it's immediately followed
		-- by a link whose display begins with * too
		if allReconstructed then
			text = text:gsub("^%*%[%[([^|%]]+)|%*", "[[%1|*")
		end
	else
		-- There is no embedded wikilink, make a link using the parameters.
		text = makeLangLink({ target = text, display = data.alt }, data.lang, data.sc, data.id, allow_self_link)
	end

	return text
end

-- Wraps `text` in the HTML markup for the given kind of annotation.
--   text:     the string to wrap; any non-string value yields ""
--   itemType: "gloss", "tr", "ts", "pos" or "annotations"; any other value
--             leaves the text unwrapped
--   face:     only consulted for "tr", where "term" selects the mention style
--   lang:     language object; only used for "tr" (its code fills the lang attribute)
function export.mark(text, itemType, face, lang)
	local opening, closing = "", ""

	if itemType == "gloss" then
		opening = '<span class="mention-gloss-double-quote">“</span><span class="mention-gloss">'
		closing = '</span><span class="mention-gloss-double-quote">”</span>'
	elseif itemType == "tr" then
		closing = '</span>'
		if face == "term" then
			opening = '<span lang="' .. lang:getCode() .. '" class="tr mention-tr Latn">'
		else
			opening = '<span lang="' .. lang:getCode() .. '" class="tr Latn">'
		end
	elseif itemType == "ts" then
		opening = '<span class="ts mention-ts Latn">/'
		closing = '/</span>'
	elseif itemType == "pos" then
		opening = '<span class="ann-pos">'
		closing = '</span>'
	elseif itemType == "annotations" then
		opening = '<span class="mention-gloss-paren annotation-paren">(</span>'
		closing = '<span class="mention-gloss-paren annotation-paren">)</span>'
	end

	-- The type check comes after the tags are built, so a "tr" request still
	-- reads lang:getCode() even when there is nothing to wrap.
	if type(text) ~= "string" then
		return ""
	end

	return opening .. text .. closing
end

-- Format the annotations (things following the linked term).
--   data: table that may contain `interwiki`, `genders`, `tr`, `ts`, `gloss`,
--         `pos`, `lit` and `lang`. Only the first entry of `tr` and `ts` is
--         shown. `data.genders` is normalised to a table, and `data.pos` may
--         get a tracking category appended; both are written back to `data`.
--   face: "term" selects the mention style for the transliteration; any other
--         value uses "default"
-- Returns the concatenated annotation wikitext (possibly the empty string).
function export.format_link_annotations(data, face)
	local output = {}

	-- Interwiki link
	if data.interwiki then
		table_insert(output, data.interwiki)
	end

	-- Genders
	if type(data.genders) ~= "table" then
		data.genders = { data.genders }
	end

	if #data.genders > 0 then
		local m_gen = require("Module:gender and number")
		table_insert(output, "&nbsp;" .. m_gen.format_list(data.genders, data.lang))
	end

	local annotations = {}

	-- Transliteration and transcription.
	-- Either table may be absent, so read both through a nil guard instead of
	-- indexing data.tr / data.ts directly.
	local tr = data.tr and data.tr[1]
	local ts = data.ts and data.ts[1]

	if tr or ts then
		local kind = face == "term" and face or "default"

		if tr and ts then
			table_insert(annotations,
				require("Module:script utilities").tag_translit(tr, data.lang, kind)
				.. " " .. export.mark(ts, "ts"))
		elseif ts then
			table_insert(annotations, export.mark(ts, "ts"))
		else
			table_insert(annotations,
				require("Module:script utilities").tag_translit(tr, data.lang, kind))
		end
	end

	-- Gloss/translation
	if data.gloss then
		table_insert(annotations, export.mark(data.gloss, "gloss"))
	end

	-- Part of speech
	if data.pos then
		-- debug category for pos= containing transcriptions
		if data.pos:find("/[^><]*/") then
			data.pos = data.pos .. "[[Category:links likely containing transcriptions in pos]]"
		end

		pos_tags = pos_tags or mw.loadData("Module:links/data").pos_tags
		table_insert(annotations, export.mark(pos_tags[data.pos] or data.pos, "pos"))
	end

	-- Literal/sum-of-parts meaning
	if data.lit then
		table_insert(annotations, "literally " .. export.mark(data.lit, "gloss"))
	end

	if #annotations > 0 then
		table_insert(output, " " .. export.mark(table_concat(annotations, ", "), "annotations"))
	end

	return table_concat(output)
end

-- Encode certain characters to avoid various delimiter-related issues at various stages. We need to encode < and >
-- because they end up forming part of CSS class names inside of <span ...> and will interfere with finding the end
-- of the HTML tag. I first tried converting them to URL encoding, i.e. %3C and %3E; they then appear in the URL as
-- %253C and %253E, which get mapped back to %3C and %3E when passed to [[Module:accel]]. But mapping them to &lt;
-- and &gt; somehow works magically without any further work; they appear in the URL as < and >, and get passed to
-- [[Module:accel]] as < and >. I have no idea who along the chain of calls is doing the encoding and decoding. If
-- someone knows, please modify this comment appropriately!
local encode_accel_char_map = {
	["%"] = ".",
	[" "] = "_",
	["<"] = "&lt;",
	[">"] = "&gt;",
}

-- Escapes the characters listed in encode_accel_char_map in a single string.
local function encode_accel_param_chars(param)
	-- "%%" is a literal percent sign inside the set. A lone "% " would be read
	-- as an escaped space, leaving "%" unmatched and its map entry unused.
	local retval = param:gsub("[%% <>]", encode_accel_char_map) -- discard second return value
	return retval
end

-- Builds one "prefix-value" token of the accelerated-creation class string.
-- Returns "" when `param` is nil or false. A table is flattened to a single
-- string with "*~!" between its entries.
local function encode_accel_param(prefix, param)
	if not param then
		return ""
	end
	if type(param) == "table" then
		local filled_params = {}
		-- There may be gaps in the sequence, especially for translit params.
		local maxindex = 0
		for k in pairs(param) do
			if type(k) == "number" and k > maxindex then
				maxindex = k
			end
		end
		for index = 1, maxindex do
			filled_params[index] = param[index] or ""
		end
		-- [[Module:accel]] splits these up again.
		param = table_concat(filled_params, "*~!")
	end
	-- This is decoded again by [[WT:ACCEL]].
	return prefix .. encode_accel_param_chars(param)
end

-- A version of {{l}} or {{m}} that can be called from other modules too.
--   data: table describing the link. `lang` is a language object. `term`,
--         `alt`, `sc`, `tr` and `ts` may each be a single value or a table of
--         values and are converted to tables in place. `term` and `alt` may
--         hold several forms separated by "//" ("\//" is a literal "//").
--         `id`, `genders`, `gloss`, `pos`, `lit`, `accel` and `interwiki` are
--         optional.
--   face: display style, forwarded to the script utilities and annotations
--   allow_self_link: forwarded to export.language_link
--   no_check_redundant_translit: when true and a manual transliteration is
--         given, the automatic transliteration is not generated for comparison
-- Returns the wikitext of the link(s), annotations and maintenance categories.
-- Raises an error if `data` is not a table.
function export.full_link(data, face, allow_self_link, no_check_redundant_translit)
	if type(data) ~= "table" then
		error("The first argument to the function full_link must be a table. "
			.. "See Module:links/documentation for more information.")
	end

	local multiparams = {"term", "alt", "sc", "tr", "ts"}
	local terms = {true}

	-- Generate multiple forms if applicable. Only the first two multiparams
	-- ("term" and "alt") are split or expanded.
	for i = 1, 2 do
		local name = multiparams[i]
		local value = data[name]
		if type(value) == "string" and value:find("//") then
			-- Protect escaped separators with private-use characters, split,
			-- then restore them.
			value = value
				:gsub("\\\\//", uchar(0xE000) .. "//")
				:gsub("\\//", uchar(0xE001))
			local parts = mw.text.split(value, "//") or {}
			data[name] = parts
			for j, subparam in ipairs(parts) do
				parts[j] = subparam
					:gsub(uchar(0xE000), "\\")
					:gsub(uchar(0xE001), "//")
				if subparam == "" then parts[j] = nil end
			end
		elseif type(value) == "string" and not (type(data.term) == "string" and data.term:find("//")) then
			data[name] = data.lang:generateForms(value)
		elseif type(value) == "table" and #value == 1 then
			data[name] = data.lang:generateForms(value[1])
		end
	end

	-- Make every multiparam a table and record which indices are in use.
	for _, multiparam in ipairs(multiparams) do
		data[multiparam] = data[multiparam] or {}
		if type(data[multiparam]) == "string" then
			data[multiparam] = {data[multiparam]}
		elseif data[multiparam] and data[multiparam]._type == "script object" then
			data[multiparam] = {data[multiparam]}
		end
		for index in pairs(data[multiparam]) do
			terms[index] = true
		end
	end

	-- Create the link
	local output = {}
	local categories = {}
	local link = ""

	local phonetic_extraction = mw.loadData("Module:links/data").phonetic_extraction

	for i in ipairs(terms) do
		-- Is there any text to show?
		if (data.term[i] or data.alt[i]) then
			-- Try to detect the script if it was not provided
			local best = data.lang:findBestScript(data.alt[i] or data.term[i])
			if not data.sc[i] then
				data.sc[i] = best
			else
				-- Track uses of sc parameter
				require("Module:debug/track")("links/sc")

				if data.sc[i]:getCode() == best:getCode() then
					require("Module:debug/track")("links/sc/redundant")
					require("Module:debug/track")("links/sc/redundant/" .. data.sc[i]:getCode())
				else
					require("Module:debug/track")("links/sc/needed")
					require("Module:debug/track")("links/sc/needed/" .. data.sc[i]:getCode())
				end
			end

			-- If using a discouraged character sequence, add to maintenance category
			if data.sc[i]:hasNormalizationFixes() == true then
				if (data.term[i] and data.sc[i]:fixDiscouragedSequences(toNFC(data.term[i])) ~= toNFC(data.term[i])) or (data.alt[i] and data.sc[i]:fixDiscouragedSequences(toNFC(data.alt[i])) ~= toNFC(data.alt[i])) then
					table_insert(categories, "[[Category:Pages using discouraged character sequences]]")
				end
			end

			local class = ""

			if data.accel then
				local form = data.accel.form and encode_accel_param_chars(data.accel.form) .. "-form-of" or ""
				local gender = encode_accel_param("gender-", data.accel.gender)
				local pos = encode_accel_param("pos-", data.accel.pos)
				local translit = encode_accel_param("transliteration-",
					data.accel.translit or (data.tr[i] ~= "-" and data.tr[i] or nil))
				local target = encode_accel_param("target-", data.accel.target)
				local lemma = encode_accel_param("origin-", data.accel.lemma)
				local lemma_translit = encode_accel_param("origin_transliteration-", data.accel.lemma_translit)
				local no_store = data.accel.no_store and "form-of-nostore" or ""

				local accel =
					form .. " " ..
					gender .. " " ..
					pos .. " " ..
					translit .. " " ..
					target .. " " ..
					lemma .. " " ..
					lemma_translit .. " " ..
					no_store .. " "

				class = "form-of lang-" .. data.lang:getCode() .. " " .. accel
			end

			-- Only make a link if the term has been given, otherwise just show the alt text without a link
			local term_data = {term = data.term[i], alt = data.alt[i], lang = data.lang, sc = data.sc[i], id = data.id, genders = data.genders, tr = data.tr[i], ts = data.ts[i], gloss = data.gloss, pos = data.pos, lit = data.lit, accel = data.accel, interwiki = data.interwiki}

			link = require("Module:script utilities").tag_text(
				data.term[i] and export.language_link(term_data, allow_self_link)
				or data.alt[i], data.lang, data.sc[i], face, class)
		else
			--[[	No term to show.
					Is there at least a transliteration we can work from?	]]
			link = require("Module:script utilities").request_script(data.lang, data.sc[i])

			if link == "" or not data.tr[i] or data.tr[i] == "-" then
				-- No link to show, and no transliteration either. Show a term request.
				if mw.title.getCurrentTitle().nsText ~= "Template" then
					table_insert(categories, "[[Category:" .. data.lang:getCanonicalName() .. " term requests]]")
				end

				link = "<small>[Term?]</small>"
			end
		end
		table_insert(output, link)
		if i < #terms then table_insert(output, "'''／'''") end
	end

	if data.tr[1] == "" or data.tr[1] == "-" then
		data.tr[1] = nil

	elseif phonetic_extraction[data.lang:getCode()] then
		local m_phonetic = require(phonetic_extraction[data.lang:getCode()])
		data.tr[1] = data.tr[1] or m_phonetic.getTranslit(export.remove_links(data.alt[1] or data.term[1]))

	elseif (data.term[1] or data.alt[1]) and not data.sc[1]:getCode():find("Lati?n") then

		-- Try to generate a transliteration, unless transliteration has been supplied and either
		-- no_check_redundant_translit is given or we are in a high-memory entry. (Checking for redundant
		-- transliteration can use up significant amounts of memory so we don't want to do it if memory
		-- is tight. `no_check_redundant_translit` is currently set when called ultimately from
		-- {{multitrans|...|no-check-redundant-translit=1}}.)
		if not (data.tr[1] and (
			no_check_redundant_translit or
			mw.loadData("Module:links/data").high_memory_entries[mw.title.getCurrentTitle().text]
		)) then
			local automated_tr = data.lang:transliterate(export.remove_links(data.alt[1] or data.term[1]), data.sc[1])

			if automated_tr then
				local manual_tr = data.tr[1]

				if manual_tr then
					if manual_tr == automated_tr then
						table_insert(categories,
							"[[Category:Terms with redundant transliterations]]"
									.. "[[Category:Terms with redundant transliterations/" .. data.lang:getCode() .. "]]")
					else
						-- Prevents Arabic root categories from flooding the tracking categories.
						if mw.title.getCurrentTitle().nsText ~= "Category" then
							table_insert(categories,
								"[[Category:Terms with manual transliterations different from the automated ones]]"
										.. "[[Category:Terms with manual transliterations different from the automated ones/" .. data.lang:getCode() .. "]]")
						end
					end
				end

				if (not manual_tr) or data.lang:overrideManualTranslit() then
					data.tr[1] = automated_tr
				end
			end
		end
	end

	-- Link to the transliteration entry for languages that require this
	if data.tr[1] and data.lang:link_tr() then
		data.tr[1] = export.language_link { lang = data.lang, term = data.tr[1] }
	end

	table_insert(output, export.format_link_annotations(data, face))

	return table_concat(output) .. table_concat(categories)
end


-- Strips links from wikitext: category links are deleted outright, piped
-- links lose their target (only the display text is kept), and all remaining
-- double square brackets are removed.
--   text: a string, or a table whose args[1] holds the string
-- Returns the stripped string; "" when there is no text.
function export.remove_links(text)
	if type(text) == "table" then
		text = text.args[1]
	end

	if not text or text == "" then
		return ""
	end

	-- Category links go first, so that their sort keys are not mistaken for
	-- display text by the piped-link step below.
	local stripped = ugsub(text, "%[%[Category:[^|%]]-|?[^|%]]-%]%]", "")

	stripped = stripped
		:gsub("%[%[[^|%]]-|", "")
		:gsub("%[%[", "")
		:gsub("%]%]", "")

	return stripped
end

-- Rewrites every [[...]] wikilink in `text` as an English link. The script is
-- detected once from the whole text, and links to the current page are kept
-- as real links (allow_self_link is true).
-- Returns the rewritten text only.
function export.english_links(text)
	local english = require("Module:languages").getByCode("en")
	local script = english:findBestScript(text)

	local function relink(linktext)
		return makeLangLink(parseLink(linktext), english, script, nil, true)
	end

	-- Assigning to a single local drops gsub's second return value, the
	-- number of replacements.
	local result = text:gsub("%[%[([^%]]+)%]%]", relink)
	return result
end

--[=[
This decodes old section encodings.
For example, Norwegian_Bokm.C3.A5l → Norwegian_Bokmål.
It isn't picky about whether the section encodings represent the UTF-8 encoding
of a real Unicode character, so it will mangle section names that contain
a period followed by two uppercase hex characters. At least such section names
are probably pretty rare.

Wiktionary adds an additional id="" attribute for sections
using a legacy encoding, if it is different from the modern minimally modified attribute.
It is like percent encoding (URI or URL encoding) except with "." instead of "%".
See [[mw:Manual:$wgFragmentMode]] and the code that does the encoding at
https://gerrit.wikimedia.org/r/plugins/gitiles/mediawiki/core/+/7bf779524ab1fd8e1d74f79ea4840564d48eea4d/includes/parser/Sanitizer.php#893
]=]

-- Matches one uppercase hexadecimal digit. The character class %x is avoided
-- on purpose: it also accepts a-f, which do not occur in these anchor
-- encodings.
local capitalHex = "[0-9A-F]"

-- Replaces every ".XX" (a period followed by two uppercase hex digits) in
-- `anchor` with the byte whose value is XX. Returns the decoded string only.
local function decodeAnchor(anchor)
	local encodedByte = "%.(" .. capitalHex:rep(2) .. ")"

	local function toByte(hexByte)
		return string.char(tonumber(hexByte, 16))
	end

	local decoded = anchor:gsub(encodedByte, toByte)
	return decoded
end

-- Builds a piped wikilink to a section, displayed as "Page § Section"
-- (or "§ Section" when the page part is empty).
--   link: string of the form "Page#Section" or "#Section"
-- Raises an error if `link` is not a string, contains more than one number
-- sign, or has no number sign followed by a section name.
function export.section_link(link)
	if type(link) ~= "string" then
		error("The first argument to section_link was a " .. type(link) .. ", but it should be a string.")
	end

	link = link:gsub("_", " ")

	-- gsub's second return value is the number of "#" found.
	local _, numberSigns = link:gsub("#", "")
	if numberSigns > 1 then
		error("The section link should only contain one number sign (#).")
	end

	link = mw.uri.decode(link, "WIKI")

	local page, section = link:match("^([^#]*)#(.+)$")
	if not section then
		error("The function “section_link” could not find a number sign marking a section name.")
	end

	section = decodeAnchor(section)

	-- URI-encode (percent-encode) the section in the link target to allow
	-- square brackets and other dodgy characters in the section name; left
	-- unencoded they prevent the parser from creating a link. The displayed
	-- text keeps the decoded form.
	local encodedSection = mw.uri.encode(section, "WIKI")

	if page ~= "" then
		return "[[" .. page .. "#" .. encodedSection
			.. "|" .. page .. " §&nbsp;" .. section .. "]]"
	end

	return "[[#" .. encodedSection
		.. "|§&nbsp;" .. section .. "]]"
end

return export