Module:Citation/CS1/Configuration: Difference between revisions

From Vigyanwiki
< Module:Citation/CS1
(bump pmc;)
m (1 revision imported)
 
(22 intermediate revisions by 6 users not shown)
Line 1: Line 1:
local lang_obj = mw.language.getContentLanguage(); -- make a language object for the local language; used here for languages and dates  
local lang_obj = mw.language.getContentLanguage(); -- make a language object for the local language; used here for languages and dates  
--[[--------------------------< S E T T I N G S >--------------------------------------------------------------
boolean settings used to control various things.  these settings are located here to make them easy to find
]]
-- these settings local to this module only
local local_digits_from_mediawiki = false; -- for i18n; when true, module fills date_names['local_digits'] from MediaWiki; manual fill required else; always false at en.wiki
local local_date_names_from_mediawiki = false; -- for i18n; when true, module fills date_names['local']['long'] and date_names['local']['short'] from MediaWiki;
-- manual translation required else; always false at en.wiki
-- these settings exported to other modules
local use_identifier_redirects = true; -- when true use redirect name for identifier label links; always true at en.wiki
local local_lang_cat_enable = false; -- when true categorizes pages where |language=<local wiki's language>; always false at en.wiki
local date_name_auto_xlate_enable = false; -- when true translates English month-names to the local-wiki's language month names; always false at en.wiki
local date_digit_auto_xlate_enable = false; -- when true translates Western date digit to the local-wiki's language digits (date_names['local_digits']); always false at en.wiki


--[[--------------------------< U N C A T E G O R I Z E D _ N A M E S P A C E S >------------------------------
--[[--------------------------< U N C A T E G O R I Z E D _ N A M E S P A C E S >------------------------------


List of namespaces that should not be included in citation error categories.
List of namespace identifiers for namespaces that will not be included in citation error categories.
Same as setting notracking = true by default.
Same as setting notracking = true by default.


Note: Namespace names should use underscores instead of spaces.
For wikis that have a current version of Module:cs1 documentation support, this #invoke will return an unordered
list of namespace names and their associated identifiers:
{{#invoke:cs1 documentation support|uncategorized_namespace_lister|all=<anything>}}


]]
]]


local uncategorized_namespaces = { 'User', 'Talk', 'User_talk', 'Wikipedia_talk',
uncategorized_namespaces_t = {[2]=true}; -- init with user namespace id
'File_talk', 'Template_talk', 'Help_talk', 'Category_talk', 'Portal_talk',
for k, _ in pairs (mw.site.talkNamespaces) do -- add all talk namespace ids
'Book_talk', 'Draft_talk', 'Education_Program_talk', 'Module_talk', 'MediaWiki_talk' };
uncategorized_namespaces_t[k] = true;
end
 
local uncategorized_subpages = {'/[Ss]andbox', '/[Tt]estcases', '/[^/]*[Ll]og', '/[Aa]rchive'}; -- list of Lua patterns found in page names of pages we should not categorize
local uncategorized_subpages = {'/[Ss]andbox', '/[Tt]estcases', '/[^/]*[Ll]og', '/[Aa]rchive'}; -- list of Lua patterns found in page names of pages we should not categorize


Line 30: Line 52:
['archived-dead'] = 'Archived from $1 on $2',
['archived-dead'] = 'Archived from $1 on $2',
['archived-live'] = '$1 from the original on $2',
['archived-live'] = '$1 from the original on $2',
['archived-missing'] = 'Archived from the original$1 on $2',
['archived-unfit'] = 'Archived from the original on ',
['archived-unfit'] = 'Archived from the original on ',
['archived'] = 'Archived',
['archived'] = 'Archived',
Line 44: Line 65:
['inset'] = '$1 inset',
['inset'] = '$1 inset',
['interview'] = 'Interviewed by $1',
['interview'] = 'Interviewed by $1',
['lay summary'] = 'Lay summary',
['mismatch'] = '<code class="cs1-code">&#124;$1=</code> / <code class="cs1-code">&#124;$2=</code> mismatch', -- $1 is year param name; $2 is date param name
['mismatch'] = '<code class="cs1-code">&#124;$1=</code> / <code class="cs1-code">&#124;$2=</code> mismatch', -- $1 is year param name; $2 is date param name
['newsgroup'] = '[[Usenet newsgroup|Newsgroup]]:&nbsp;$1',
['newsgroup'] = '[[Usenet newsgroup|Newsgroup]]:&nbsp;$1',
Line 62: Line 82:


['vol'] = '$1 Vol.&nbsp;$2', -- $1 is sepc; bold journal style volume is in presentation{}
['vol'] = '$1 Vol.&nbsp;$2', -- $1 is sepc; bold journal style volume is in presentation{}
['vol-no'] = '$1 Vol.&nbsp;$2, no.&nbsp;$3', -- sepc, volume, issue (alternatively insert $1 after $2, but then we'd also have to change capitalization)
['vol-no'] = '$1 Vol.&nbsp;$2, no.&nbsp;$3', -- sepc, volume, issue (alternatively insert $1 after $2, but then we'd also have to change capitalization)
['issue'] = '$1 No.&nbsp;$2', -- $1 is sepc
['issue'] = '$1 No.&nbsp;$2', -- $1 is sepc
['art'] = '$1 Art.&nbsp;$2', -- $1 is sepc; for {{cite conference}} only
['vol-art'] = '$1 Vol.&nbsp;$2, art.&nbsp;$3', -- sepc, volume, article-number; for {{cite conference}} only


['j-vol'] = '$1 $2', -- sepc, volume; bold journal volume is in presentation{}
['j-vol'] = '$1 $2', -- sepc, volume; bold journal volume is in presentation{}
['j-issue'] = ' ($1)',
['j-issue'] = ' ($1)',
['j-article-num'] = ' $1', -- TODO: any punctuation here? static text?


['nopp'] = '$1 $2'; -- page(s) without prefix; $1 is sepc
['nopp'] = '$1 $2'; -- page(s) without prefix; $1 is sepc
Line 110: Line 134:
used as class attributes in the <cite> tag that encloses the citation so these names may not contain spaces while
used as class attributes in the <cite> tag that encloses the citation so these names may not contain spaces while
the canonical template name may.  These names are used in warning_msg_e and warning_msg_m to create links to the
the canonical template name may.  These names are used in warning_msg_e and warning_msg_m to create links to the
template's documentation when an article is displayed in preivew mode.
template's documentation when an article is displayed in preview mode.


Most cs1|2 template |CitationClass= values at en.wiki match their canonical template names so are not listed here.
Most cs1|2 template |CitationClass= values at en.wiki match their canonical template names so are not listed here.
Line 117: Line 141:


local citation_class_map_t = { -- TODO: if kept, these and all other config.CitationClass 'names' require some sort of i18n
local citation_class_map_t = { -- TODO: if kept, these and all other config.CitationClass 'names' require some sort of i18n
['audio-visual'] = 'AV media', -- TODO: move to ~/Configuration
['arxiv'] = 'arXiv',
['audio-visual'] = 'AV media',
['AV-media-notes'] = 'AV media notes',
['AV-media-notes'] = 'AV media notes',
['biorxiv'] = 'bioRxiv',
['citeseerx'] = 'CiteSeerX',
['encyclopaedia'] = 'encyclopedia',
['encyclopaedia'] = 'encyclopedia',
['mailinglist'] = 'mailing list',
['mailinglist'] = 'mailing list',
['pressrelease'] = 'press release'
['medrxiv'] = 'medRxiv',
['pressrelease'] = 'press release',
['ssrn'] = 'SSRN',
['techreport'] = 'tech report',
}
}


Line 164: Line 194:


['format'] = ' <span class="cs1-format">($1)</span>', -- for |format=, |chapter-format=, etc.
['format'] = ' <span class="cs1-format">($1)</span>', -- for |format=, |chapter-format=, etc.
['interwiki'] = ' <span class="cs1-format">[in $1]</span>', -- for interwiki-language-linked author, editor, etc
['interproj'] = ' <span class="cs1-format">[at $1]</span>', -- for interwiki-project-linked author, editor, etc (:d: and :s: supported; :w: ignored)


-- various access levels, for |access=, |doi-access=, |arxiv=, ...
-- various access levels, for |access=, |doi-access=, |arxiv=, ...
Line 229: Line 261:
['ArchiveFormat'] = 'archive-format',
['ArchiveFormat'] = 'archive-format',
['ArchiveURL'] = {'archive-url', 'archiveurl'}, -- Used by InternetArchiveBot
['ArchiveURL'] = {'archive-url', 'archiveurl'}, -- Used by InternetArchiveBot
['ArticleNumber'] = 'article-number',
['ASINTLD'] = 'asin-tld',
['ASINTLD'] = 'asin-tld',
['At'] = 'at', -- Used by InternetArchiveBot
['At'] = 'at', -- Used by InternetArchiveBot
Line 264: Line 297:
['Issue'] = {'issue', 'number'},
['Issue'] = {'issue', 'number'},
['Language'] = {'language', 'lang'},
['Language'] = {'language', 'lang'},
['LayDate'] = 'lay-date',
['LayFormat'] = 'lay-format',
['LaySource'] = 'lay-source',
['LayURL'] = 'lay-url',
['MailingList'] = {'mailing-list', 'mailinglist'}, -- cite mailing list only
['MailingList'] = {'mailing-list', 'mailinglist'}, -- cite mailing list only
['Map'] = 'map', -- cite map only
['Map'] = 'map', -- cite map only
Line 315: Line 344:
['Title'] = 'title', -- Used by InternetArchiveBot
['Title'] = 'title', -- Used by InternetArchiveBot
['TitleLink'] = {'title-link', 'episode-link', 'episodelink'}, -- Used by InternetArchiveBot
['TitleLink'] = {'title-link', 'episode-link', 'episodelink'}, -- Used by InternetArchiveBot
['TitleNote'] = 'department',
['TitleNote'] = {'title-note', 'department'},
['TitleType'] = {'type', 'medium'},
['TitleType'] = {'type', 'medium'},
['TransChapter'] = {'trans-article', 'trans-chapter', 'trans-contribution',
['TransChapter'] = {'trans-article', 'trans-chapter', 'trans-contribution',
Line 321: Line 350:
['Transcript'] = 'transcript',
['Transcript'] = 'transcript',
['TranscriptFormat'] = 'transcript-format',
['TranscriptFormat'] = 'transcript-format',
['TranscriptURL'] = {'transcript-url', 'transcripturl'}, -- Used by InternetArchiveBot
['TranscriptURL'] = 'transcript-url', -- Used by InternetArchiveBot
['TransMap'] = 'trans-map', -- cite map only
['TransMap'] = 'trans-map', -- cite map only
['TransPeriodical'] = {'trans-journal', 'trans-magazine', 'trans-newspaper',
['TransPeriodical'] = {'trans-journal', 'trans-magazine', 'trans-newspaper',
Line 336: Line 365:
['Year'] = 'year',
['Year'] = 'year',


['AuthorList-First'] = {"first#", "author-first#", "author#-first", "given#",
['AuthorList-First'] = {"first#", "author-first#", "author#-first", "author-given#", "author#-given",
"author-given#", "author#-given"},
"subject-first#", "subject#-first", "subject-given#", "subject#-given",
['AuthorList-Last'] = {"last#", "author-last#", "author#-last", "surname#",
"given#"},
"author-surname#", "author#-surname", "author#", "subject#", 'host#'},
['AuthorList-Last'] = {"last#", "author-last#", "author#-last", "author-surname#", "author#-surname",
"subject-last#", "subject#-last", "subject-surname#", "subject#-surname",
"author#", 'host#', "subject#", "surname#"},
['AuthorList-Link'] = {"author-link#", "author#-link", "subject-link#",
['AuthorList-Link'] = {"author-link#", "author#-link", "subject-link#",
"subject#-link", "authorlink#", "author#link"},
"subject#-link", "authorlink#", "author#link"},
Line 383: Line 414:
'AuthorList-Mask', 'ContributorList-Mask', 'EditorList-Mask', 'InterviewerList-Mask', 'TranslatorList-Mask', -- name-list mask may have name separators
'AuthorList-Mask', 'ContributorList-Mask', 'EditorList-Mask', 'InterviewerList-Mask', 'TranslatorList-Mask', -- name-list mask may have name separators
'PostScript', 'Quote', 'ScriptQuote', 'TransQuote', 'Ref', -- miscellaneous
'PostScript', 'Quote', 'ScriptQuote', 'TransQuote', 'Ref', -- miscellaneous
'ArchiveURL', 'ChapterURL', 'ConferenceURL', 'LayURL', 'MapURL', 'TranscriptURL', 'URL', -- URL-holding parameters
'ArchiveURL', 'ChapterURL', 'ConferenceURL', 'MapURL', 'TranscriptURL', 'URL', -- URL-holding parameters
}
}


local url_meta_params = { -- table of aliases[] keys (meta parameters); each key has a table of parameter names for a value
local url_meta_params = { -- table of aliases[] keys (meta parameters); each key has a table of parameter names for a value
'ArchiveURL', 'ChapterURL', 'ConferenceURL', 'ID', 'LayURL', 'MapURL', 'TranscriptURL', 'URL', -- parameters allowed to hold urls
'ArchiveURL', 'ChapterURL', 'ConferenceURL', 'ID', 'MapURL', 'TranscriptURL', 'URL', -- parameters allowed to hold urls
'Page', 'Pages', 'At', 'QuotePage', 'QuotePages', -- insource locators allowed to hold urls
'Page', 'Pages', 'At', 'QuotePage', 'QuotePages', -- insource locators allowed to hold urls
}
}
Line 412: Line 443:
local punct_skip = {};
local punct_skip = {};
local url_skip = {};
local url_skip = {};
--[[--------------------------< S I N G L E - L E T T E R  S E C O N D - L E V E L  D O M A I N S >----------
this is a list of tlds that are known to have single-letter second-level domain names.  This list does not include
ccTLDs which are accepted in is_domain_name().
]]
local single_letter_2nd_lvl_domains_t = {'cash', 'company', 'foundation', 'org', 'today'};




Line 425: Line 466:
local special_case_translation = {
local special_case_translation = {
['AuthorList'] = 'authors list', -- used to assemble maintenance category names
['AuthorList'] = 'authors list', -- used to assemble maintenance category names
['ContributorList'] = 'contributors list', -- translation of these names plus translation of the base mainenance category names in maint_cats{} table below
['ContributorList'] = 'contributors list', -- translation of these names plus translation of the base maintenance category names in maint_cats{} table below
['EditorList'] = 'editors list', -- must match the names of the actual categories
['EditorList'] = 'editors list', -- must match the names of the actual categories
['InterviewerList'] = 'interviewers list', -- this group or translations used by name_has_ed_markup() and name_has_mult_names()
['InterviewerList'] = 'interviewers list', -- this group or translations used by name_has_ed_markup() and name_has_mult_names()
Line 476: Line 517:
{['en'] = {'about us', true}, ['local'] = nil},
{['en'] = {'about us', true}, ['local'] = nil},
{['en'] = {'%f[%a][Aa]dvisor%f[%A]', false}, ['local'] = nil},
{['en'] = {'%f[%a][Aa]dvisor%f[%A]', false}, ['local'] = nil},
{['en'] = {'allmusic', true}, ['local'] = nil},
{['en'] = {'%f[%a][Aa]uthor%f[%A]', false}, ['local'] = nil},
{['en'] = {'%f[%a][Aa]uthor%f[%A]', false}, ['local'] = nil},
{['en'] = {'business', true}, ['local'] = nil},
{['en'] = {'cnn', true}, ['local'] = nil},
{['en'] = {'collaborator', true}, ['local'] = nil},
{['en'] = {'collaborator', true}, ['local'] = nil},
{['en'] = {'contributor', true}, ['local'] = nil},
{['en'] = {'contributor', true}, ['local'] = nil},
Line 491: Line 535:
{['en'] = {'google', true}, ['local'] = nil},
{['en'] = {'google', true}, ['local'] = nil},
{['en'] = {'home page', true}, ['local'] = nil},
{['en'] = {'home page', true}, ['local'] = nil},
{['en'] = {'^[Ii]nc%.?$', false}, ['local'] = nil},
{['en'] = {'instagram', true}, ['local'] = nil},
{['en'] = {'instagram', true}, ['local'] = nil},
{['en'] = {'interviewer', true}, ['local'] = nil},
{['en'] = {'interviewer', true}, ['local'] = nil},
Line 498: Line 543:
{['en'] = {'policy', true}, ['local'] = nil},
{['en'] = {'policy', true}, ['local'] = nil},
{['en'] = {'privacy', true}, ['local'] = nil},
{['en'] = {'privacy', true}, ['local'] = nil},
{['en'] = {'reuters', true}, ['local'] = nil},
{['en'] = {'translator', true}, ['local'] = nil},
{['en'] = {'translator', true}, ['local'] = nil},
{['en'] = {'tumblr', true}, ['local'] = nil},
{['en'] = {'tumblr', true}, ['local'] = nil},
Line 591: Line 637:
date_names[invert_t[2]][i] = name; -- invert to get [i] = 'name' for conversions from ymd
date_names[invert_t[2]][i] = name; -- invert to get [i] = 'name' for conversions from ymd
end
end
end
if local_digits_from_mediawiki then -- if fetching local digits from MediaWiki is enabled
local digits_t = {};
for i=0, 9 do -- loop 10x and
digits_t [lang_obj:formatNum (i)] = tostring (i); -- format the loop indexer as local lang table index and assign loop indexer (a string) as the value
end
date_names['local_digits'] = digits_t;
end
end


Line 612: Line 666:
'{{ *([Mm]DY) *[|}]', -- 0
'{{ *([Mm]DY) *[|}]', -- 0
}
}
local title_object = mw.title.getCurrentTitle();
local content; -- done this way  so that unused templates appear in unused-template-reports; self-transcluded makes them look like they are used
if 10 ~= title_object.namespace then -- all namespaces except Template
content = title_object:getContent() or ''; -- get the content of the article or ''; new pages edited w/ve do not have 'content' until saved; ve does not preview; phab:T221625
end


local function get_date_format ()
local function get_date_format ()
local title_object = mw.title.getCurrentTitle();
-- if title_object.namespace == 10 then -- not in template space so that unused templates appear in unused-template-reports;  
if title_object.namespace == 10 then -- not in template space so that unused templates appear in unused-template-reports;  
if not content then -- nil content when we're in template
return nil; -- auto-formatting does not work in Template space so don't set global_df
return nil; -- auto-formatting does not work in Template space so don't set global_df
end
end
local content = title_object:getContent() or ''; -- get the content of the article or ''; new pages edited w/ve do not have 'content' until saved; ve does not preview; phab:T221625
for _, pattern in ipairs (df_template_patterns) do -- loop through the patterns looking for {{Use dmy dates}} or {{Use mdy dates}} or any of their redirects
for _, pattern in ipairs (df_template_patterns) do -- loop through the patterns looking for {{Use dmy dates}} or {{Use mdy dates}} or any of their redirects
local start, _, match = content:find(pattern); -- match is the three letters indicating desired date format
local start, _, match = content:find(pattern); -- match is the three letters indicating desired date format
if match then
if match then
content = content:match ('%b{}', start); -- get the whole template
local use_dates_template = content:match ('%b{}', start); -- get the whole template
if content:match ('| *cs1%-dates *= *[lsy][sy]?') then -- look for |cs1-dates=publication date length access-/archive-date length
if use_dates_template:match ('| *cs1%-dates *= *[lsy][sy]?') then -- look for |cs1-dates=publication date length access-/archive-date length
return match:lower() .. '-' .. content:match ('| *cs1%-dates *= *([lsy][sy]?)');
return match:lower() .. '-' .. use_dates_template:match ('| *cs1%-dates *= *([lsy][sy]?)');
else
else
return match:lower() .. '-all'; -- no |cs1-dates= k/v pair; return value appropriate for use in |df=
return match:lower() .. '-all'; -- no |cs1-dates= k/v pair; return value appropriate for use in |df=
Line 632: Line 691:
end
end


local global_df;
local global_df; -- TODO: add this to <global_cs1_config_t>?




Line 824: Line 883:




--[[---------------------< S T R I P M A R K E R S >----------------------------
--[[--------------------------< C S 1 _ C O N F I G _ G E T >--------------------------------------------------


Common pattern definition location for stripmarkers so that we don't have to go
fetch and validate values from {{cs1 config}} template to fill <global_cs1_config_t>
hunting for them if (when) MediaWiki changes their form.


]]
no error messages; when errors are detected, the parameter value from {{cs1 config}} is blanked.


local stripmarkers = {
Supports all parameters and aliases associated with the metaparameters: DisplayAuthors, DisplayContributors,
['any'] = '\127[^\127]*UNIQ%-%-(%a+)%-[%a%d]+%-QINU[^\127]*\127', -- capture returns name of stripmarker
DisplayEditors, DisplayInterviewers, DisplayTranslators, NameListStyle, and Mode.  The DisplayWhatever metaparameters
['math'] = '\127[^\127]*UNIQ%-%-math%-[%a%d]+%-QINU[^\127]*\127' -- math stripmarkers used in coins_cleanup() and coins_replace_math_stripmarker()
accept numeric values only (|display-authors=etal and the like are not supported).
}


]]


--[[------------< I N V I S I B L E _ C H A R A C T E R S >---------------------
local global_cs1_config_t = {}; -- TODO: add value returned from get_date_format() to this table?


This table holds non-printing or invisible characters indexed either by name or
local function get_cs1_config ()
by Unicode group. Values are decimal representations of UTF-8 codes.  The table
-- if title_object.namespace == 10 then -- not in template space so that unused templates appear in unused-template-reports;
is organized as a table of tables because the Lua pairs keyword returns table
if not content then -- nil content when we're in template
data in an arbitrary order.  Here, we want to process the table from top to bottom
return nil; -- auto-formatting does not work in Template space so don't set global_df
because the entries at the top of the table are also found in the ranges specified
end
by the entries at the bottom of the table.
 
local start = content:find('{{ *[Cc][Ss]1 config *[|}]'); -- <start> is offset into <content> when {{cs1 config}} found; nil else
if start then
local cs1_config_template = content:match ('%b{}', start); -- get the whole template


Also here is a pattern that recognizes stripmarkers that begin and end with the
if not cs1_config_template then
delete characters.  The nowiki stripmarker is not an error but some others are
return nil;
because the parameter values that include them become part of the template's
end
metadata before stripmarker replacement.


]]
local params_t = mw.text.split (cs1_config_template:gsub ('^{{%s*', ''):gsub ('%s*}}$', ''), '%s*|%s*'); -- remove '{{' and '}}'; make a sequence of parameter/value pairs (split on the pipe)
table.remove (params_t, 1); -- remove the template name because it isn't a parameter/value pair


local invisible_defs = {
local config_meta_params_t = {'DisplayAuthors', 'DisplayContributors', 'DisplayEditors', 'DisplayInterviewers', 'DisplayTranslators', 'NameListStyle', 'Mode'};
del = '\127', -- used to distinguish between stripmarker and del char
local meta_param_map_t = {}; -- list of accepted parameter names usable in {{cs1 config}} goes here
zwj = '\226\128\141', -- used with capture because zwj may be allowed
}
for _, meta_param in ipairs (config_meta_params_t) do -- for i18n using <config_meta_params_t>, map template parameter names to their metaparameter equivalents
if 'table' == type (aliases[meta_param]) then -- if <meta_param> is a sequence,
for _, param in ipairs (aliases[meta_param]) do -- extract its contents
meta_param_map_t[param] = meta_param; -- and add to <meta_param_map_t>
end
else
meta_param_map_t[aliases[meta_param]] = meta_param; -- not a sequence so just add the parameter to <meta_param_map_t>
end
end
 
local keywords_t = {}; -- map valid keywords to their associate metaparameter; reverse form of <keyword_lists[key] for these metaparameters
for _, metaparam_t in ipairs ({{'NameListStyle', 'name-list-style'}, {'Mode', 'mode'}}) do -- only these metaparameter / keywords_lists key pairs
for _, keyword in ipairs (keywords_lists[metaparam_t[2]]) do -- spin through the list of keywords
keywords_t[keyword] = metaparam_t[1]; -- add [keyword] = metaparameter to the map
end
end
 
for _, param in ipairs (params_t) do -- spin through the {{cs1 config}} parameters and fill <global_cs1_config_t>
local k, v = param:match ('([^=]-)%s*=%s*(.+)'); -- <k> is the parameter name; <v> is parameter's assigned value
if k then
if k:find ('^display') then -- if <k> is one of the |display-<namelist>= parameters
if v:match ('%d+') then -- the assigned value must be digits; doesn't accept 'etal'
global_cs1_config_t[meta_param_map_t[k]]=v; -- add the display param and its value to globals table
end
else
if keywords_t[v] == meta_param_map_t[k] then -- keywords_t[v] returns nil or the metaparam name; these must be the same
global_cs1_config_t[meta_param_map_t[k]]=v; -- add the parameter and its value to globals table
end
end
end
end
end
end
 
get_cs1_config (); -- fill <global_cs1_config_t>


local invisible_chars = {
{'replacement', '\239\191\189'}, -- U+FFFD, EF BF BD
{'zero width joiner', '('.. invisible_defs.zwj .. ')'}, -- U+200D, E2 80 8D; capture because zwj may be allowed
{'zero width space', '\226\128\139'}, -- U+200B, E2 80 8B
{'hair space', '\226\128\138'}, -- U+200A, E2 80 8A
{'soft hyphen', '\194\173'}, -- U+00AD, C2 AD
{'horizontal tab', '\009'}, -- U+0009 (HT), 09
{'line feed', '\010'}, -- U+000A (LF), 0A
{'no-break space', '\194\160'}, -- U+00A0 (NBSP), C2 A0
{'carriage return', '\013'}, -- U+000D (CR), 0D
{'stripmarker', stripmarkers.any}, -- stripmarker; may or may not be an error; capture returns the stripmaker type
{'delete', '('.. invisible_defs.del .. ')'}, -- U+007F (DEL), 7F; must be done after stripmarker test; capture to distinguish isolated del chars not part of stripmarker
{'C0 control', '[\000-\008\011\012\014-\031]'}, -- U+0000–U+001F (NULL–US), 00–1F (except HT, LF, CR (09, 0A, 0D))
{'C1 control', '[\194\128-\194\159]'}, -- U+0080–U+009F (XXX–APC), C2 80 – C2 9F
-- {'Specials', '[\239\191\185-\239\191\191]'}, -- U+FFF9-U+FFFF, EF BF B9 – EF BF BF
-- {'Private use area', '[\238\128\128-\239\163\191]'}, -- U+E000–U+F8FF, EE 80 80 – EF A3 BF
-- {'Supplementary Private Use Area-A', '[\243\176\128\128-\243\191\191\189]'}, -- U+F0000–U+FFFFD, F3 B0 80 80 – F3 BF BF BD
-- {'Supplementary Private Use Area-B', '[\244\128\128\128-\244\143\191\189]'}, -- U+100000–U+10FFFD, F4 80 80 80 – F4 8F BF BD
}


--[[
--[[---------------------< S T R I P M A R K E R S >----------------------------


Indic script makes use of zero width joiner as a character modifier so zwj
Common pattern definition location for stripmarkers so that we don't have to go
characters must be left in.  This pattern covers all of the unicode characters