#!/usr/bin/env python3
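"""Merge number-formatting terms from the CSL locale files into a single JSON file.

For each language code listed in language_packs below, this script downloads the
corresponding locales-<lang>.xml file from the citation-style-language/locales
repository at LOCALES_GIT_REF, pulls out the translations of "edition" and of the
short and long ordinal terms, and writes the combined result to
resource/schema/cslLocaleStrings.json inside the Zotero source tree passed on the
command line.

Usage:
    merge_csl_locales path/to/zotero/source

Note: the "{*}" namespace wildcards in the ElementTree queries require Python 3.8+.
"""
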
import json
import os
import sys
import urllib.request
import shutil
import xml.etree.ElementTree as ET
import unicodedata
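
# Git ref (branch, tag, or commit) in citation-style-language/locales to fetch locale files from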
LOCALES_GIT_REF = "master"


def main():
    if len(sys.argv) < 2 or not os.path.isdir(sys.argv[1]):
        sys.stderr.write(
            "Usage: {0} path/to/zotero/source\n".format(os.path.basename(sys.argv[0]))
        )
        return 1

    source_dir = sys.argv[1]
    schema_dir = os.path.join(source_dir, "resource", "schema")

    csl_locale_base = "https://raw.githubusercontent.com/citation-style-language/locales/{ref}/locales-{lang}.xml"

    # Codes for the language packs that we want to grab
    language_packs = [
        "af-ZA",
        "ar",
        "bg-BG",
        "ca-AD",
        "cs-CZ",
        "cy-GB",
        "da-DK",
        "de-AT",
        "de-CH",
        "de-DE",
        "el-GR",
        "en-GB",
        "en-US",
        "es-CL",
        "es-ES",
        "es-MX",
        "et-EE",
        "eu",
        "fa-IR",
        "fi-FI",
        "fr-CA",
        "fr-FR",
        "he-IL",
        "hi-IN",
        "hr-HR",
        "hu-HU",
        "id-ID",
        "is-IS",
        "it-IT",
        "ja-JP",
        "km-KH",
        "ko-KR",
        "la",
        "lt-LT",
        "lv-LV",
        "mn-MN",
        "nb-NO",
        "nl-NL",
        "nn-NO",
        "pl-PL",
        "pt-BR",
        "pt-PT",
        "ro-RO",
        "ru-RU",
        "sk-SK",
        "sl-SI",
        "sr-RS",
        "sv-SE",
        "th-TH",
        "tr-TR",
        "uk-UA",
        "vi-VN",
        "zh-CN",
        "zh-TW",
    ]

    number_formats = {}
    for lang in language_packs:
        url = csl_locale_base.format(ref=LOCALES_GIT_REF, lang=lang)
        print("Loading from " + url)
        with urllib.request.urlopen(url) as response:
            code = response.getcode()
            if code != 200:
                sys.stderr.write("Got {0} for {1}\n".format(code, url))
                return 1
            xml = ET.parse(response)

        # first, pull out all the translations for "edition", "editions", and "ed."
        edition_locators = set()
        for elem in xml.findall(".//{*}term[@name='edition']"):
            edition_locators.update(get_all_values(elem))
        edition_locators = list(edition_locators)

        # next, the translations for "-st", "-nd", "-rd", and "-th"
        short_ordinal_suffixes = set()
        for term in xml.findall(".//{*}term"):
            name = term.attrib.get("name", "")
            value = term.text
            if not (name.startswith("ordinal") and value):
                continue
            short_ordinal_suffixes.add(value)
            short_ordinal_suffixes.add(strip_superscript_chars(value))
        short_ordinal_suffixes = list(short_ordinal_suffixes)

        # lastly, the translations for "first", "second", "third", etc.
        long_ordinals = {}
        for term in xml.findall(".//{*}term"):
            name = term.attrib.get("name", "")
            if not name.startswith("long-ordinal-"):
                continue
            # parse the "01" in "long-ordinal-01"
            long_ordinals[term.text] = int(name.rsplit("-", 1)[1])

        number_formats[lang] = {
            "locators": {"edition": edition_locators},
            "ordinals": {"short": short_ordinal_suffixes, "long": long_ordinals},
        }

    number_formats[
        "credit"
    ] = f"Generated from the CSL locales repository <https://github.com/citation-style-language/locales/tree/{LOCALES_GIT_REF}> by https://github.com/zotero/zotero-build/blob/master/locale/merge_csl_locales"
    with open(os.path.join(schema_dir, "cslLocaleStrings.json"), "w") as outfile:
        json.dump(number_formats, outfile, ensure_ascii=False, indent='\t')

    print(f'Saved combined locales to {os.path.join(schema_dir, "cslLocaleStrings.json")}')


def get_all_values(elem):
    """Yield the text of a term element along with the text of its <single> and
    <multiple> children, skipping any that are missing or empty."""
    text = (elem.text or "").strip()  # elem.text is None when the element has no direct text
    single = elem.findtext("{*}single")
    multiple = elem.findtext("{*}multiple")
    if text:
        yield text
    if single:
        yield single
    if multiple:
        yield multiple


def strip_superscript_chars(s):
    """Replace all Unicode superscript modifier characters in a string with their non-superscript
    counterparts and return the modified string."""
    output = []
    for c in s:
        # A superscript character decomposes to "<super> " followed by the code point
        # of its plain counterpart, e.g. "ᵉ" (U+1D49) decomposes to "<super> 0065" ("e").
        decomposition = unicodedata.decomposition(c)
        if decomposition.startswith("<super> "):
            output.append(chr(int(decomposition[len("<super> "):], base=16)))
        else:
            output.append(c)
    return "".join(output)


if __name__ == "__main__":
    sys.exit(main())