zotero/scripts/dictionaries/build-dictionaries

#!/usr/bin/env python3

import urllib.request, json, re, time
from functools import cmp_to_key
from locale import strcoll

# Display names for the locales shipped with the client.
# Keep in sync with locale.js in zotero-client
_CLIENT_LOCALES = {
    'ar': 'عربي',
    'bg-BG': 'Български',
    'br': 'brezhoneg',
    'ca-AD': 'Català',
    'cs-CZ': 'Čeština',
    'da-DK': 'Dansk',
    'de': 'Deutsch (Deutschland)',
    'en-CA': 'English (Canada)',
    'en-US': 'English',
    'en-GB': 'English (UK)',
    'es-ES': 'Español',
    'et-EE': 'Eesti keel',
    'fa': 'فارسی',
    'fi-FI': 'suomi',
    'fr-FR': 'Français',
    'gl-ES': 'Galego',
    'hu-HU': 'magyar',
    'id-ID': 'Bahasa Indonesia',
    'is-IS': 'íslenska',
    'it-IT': 'Italiano',
    'ja-JP': '日本語',
    'km': 'ខ្មែរ',
    'ko-KR': '한국어',
    'lt-LT': 'Lietuvių',
    'nl-NL': 'Nederlands',
    'nb-NO': 'Norsk bokmål',
    'pl-PL': 'Polski',
    'pt-BR': 'Português (do Brasil)',
    'pt-PT': 'Português (Europeu)',
    'ro-RO': 'Română',
    'ru-RU': 'Русский',
    'sk-SK': 'slovenčina',
    'sl-SI': 'Slovenščina',
    'sr-RS': 'Српски',
    'sv-SE': 'Svenska',
    'ta': 'தமிழ்',
    'th-TH': 'ไทย',
    'tr-TR': 'Türkçe',
    'uk-UA': 'Українська',
    'vi-VN': 'Tiếng Việt',
    'zh-CN': '中文 (简体)',
    'zh-TW': '正體中文 (繁體)',
}

# Additional dictionaries not included as client locales
# Names from https://addons.mozilla.org/en-US/firefox/language-tools/
_EXTRA_DICTIONARY_LOCALES = {
    'de-AT': 'Deutsch (Österreich)',
    'de-CH': 'Deutsch (Schweiz)',
    'el-GR': 'Ελληνικά',
    'es-AR': 'Español (de Argentina)',
    'es-MX': 'Español (de México)',
    'he-HE': 'עברית',
    'hr-HR': 'Hrvatski',
    'lv-LV': 'Latviešu',
}

# Locale code -> display name for every dictionary we want to ship.
# Client locales come first, then the extras, in the order written above.
locales = dict(_CLIENT_LOCALES)
locales.update(_EXTRA_DICTIONARY_LOCALES)

# Locales to sort before other variants
primary_locales = ['de-DE', 'en-US', 'es-ES']

# Generate list of available dictionaries, sorted by user count descending
dictionaries = []

# Fetch the listing of dictionary add-ons and release the connection before
# issuing the per-add-on requests below.
with urllib.request.urlopen("https://services.addons.mozilla.org/api/v4/addons/language-tools/?app=firefox&type=dictionary") as resp:
    dictionary_info_list = json.loads(resp.read().decode())['results']

total = len(dictionary_info_list)
for n, dictionary_info in enumerate(dictionary_info_list, start=1):
    locale = dictionary_info['target_locale']
    guid = dictionary_info['guid']
    print(str(n) + '/' + str(total) + ': ' + locale)

    # Look up the add-on's details (version, download URL, user count)
    with urllib.request.urlopen("https://services.addons.mozilla.org/api/v4/addons/search/?guid=" + guid) as detail_resp:
        results = json.loads(detail_resp.read().decode())['results']

    # Throttle after every request, including ones for add-ons we end up
    # skipping, so we never send requests back to back
    time.sleep(1)

    # A listed add-on that the search doesn't return shouldn't abort the run
    if not results:
        print('skipping ' + locale + ' ' + guid + ' (not found)')
        continue

    dictionary = results[0]
    if dictionary['is_disabled'] or dictionary['status'] != 'public':
        print('skipping ' + locale + ' ' + guid)
        continue

    dictionaries.append({
        'id': guid,
        'locale': locale,
        'version': dictionary['current_version']['version'],
        'updated': dictionary['last_updated'],
        'url': dictionary['current_version']['files'][0]['url'],
        'users': dictionary['average_daily_users'],
    })

# Most popular first, so the most-used dictionary wins when several match
dictionaries.sort(key=lambda x: x['users'], reverse=True)

# Find dictionaries best matching the specified locales. Each dictionary that
# gets picked is removed from `dictionaries`, so it can serve only one locale
# and whatever is left over afterwards is the list of unused dictionaries.
final_dictionaries = []
for locale, display_name in locales.items():
    lang = re.split('[-_]', locale)[0]
    # A locale code with the language duplicated (e.g., 'de-DE'), which may not
    # be the actual code
    doubled = "{}-{}".format(lang, lang.upper())
    is_bare = len(locale) == 2
    is_doubled = locale == doubled

    # Dictionary codes accepted on the first pass: always the exact code...
    accepted = {locale}
    if is_bare:
        # ...locale 'de' also accepts dict 'de-DE'
        accepted.add(doubled)
    if is_doubled:
        # ...locale 'bg-BG' also accepts dict 'bg'
        accepted.add(lang)

    index = next(
        (i for i, candidate in enumerate(dictionaries)
         if candidate['locale'] in accepted),
        None
    )

    # If nothing was found, let a locale with a differing region fall back to
    # the bare-language dictionary ('cs-CZ' -> 'cs')
    if index is None and not is_bare and not is_doubled:
        index = next(
            (i for i, candidate in enumerate(dictionaries)
             if candidate['locale'] == lang),
            None
        )

    if index is None:
        continue

    chosen = dictionaries.pop(index)
    chosen['name'] = display_name
    final_dictionaries.append(chosen)

# Sort dictionaries by language code, with a few exceptions
def cmp(a, b):
    """Compare two dictionary entries by their 'locale' value.

    An entry whose locale is listed in `primary_locales` orders before any
    entry whose locale starts with the same three-character prefix (e.g.
    'en-'). Every other pair is ordered by `strcoll` on the locale strings.

    Returns a negative, zero, or positive number, as `cmp_to_key` expects.
    """
    left = a['locale']
    right = b['locale']
    for primary in primary_locales:
        # Language code plus separator, e.g. 'en-' for 'en-US'
        family = primary[0:3]
        if left == primary and right.startswith(family):
            return -1
        if right == primary and left.startswith(family):
            return 1
    return strcoll(left, right)

def _write_json(path, payload, **options):
    # Write payload to path as tab-indented UTF-8 JSON, keeping non-ASCII
    # characters unescaped
    with open(path, 'w', encoding='utf-8') as out:
        json.dump(payload, out, ensure_ascii=False, indent='\t', **options)


final_dictionaries.sort(key=cmp_to_key(cmp))

print("")

# Download each selected dictionary as dictionaries/<id>-<version>.xpi
for entry in final_dictionaries:
    source_url = entry['url']
    print("Downloading {}".format(source_url))
    destination = 'dictionaries/' + entry['id'] + '-' + entry['version'] + '.xpi'
    urllib.request.urlretrieve(source_url, destination)
    # The download URL is left out of the manifest written below
    entry.pop('url')
    # Pause between downloads
    time.sleep(1)

_write_json('dictionaries/dictionaries.json', final_dictionaries, sort_keys=True)

# Save a list of unused dictionaries
_write_json('dictionaries/dictionaries-unused.json', dictionaries)

print('\ndone')