Source code for searxng_extra.update.update_engine_traits

#!/usr/bin/env python
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Update :py:obj:`searx.enginelib.traits.EngineTraitsMap` and :origin:`searx/sxng_locales.py`

:py:obj:`searx.enginelib.traits.EngineTraitsMap.ENGINE_TRAITS_FILE`:
  Persistence of the engines' traits, fetched from the engines.

:origin:`searx/sxng_locales.py`
  Is generated from the language and region tags that are supported by a
  minimum number of engines.

The script :origin:`searxng_extra/update/update_engine_traits.py` is called in
the :origin:`CI Update data ... <.github/workflows/data-update.yml>`

"""

# pylint: disable=invalid-name
from collections import Counter
from pathlib import Path
from pprint import pformat
from unicodedata import lookup

import babel

from searx import settings, searx_dir
from searx import network
from searx.engines import load_engines
from searx.enginelib.traits import EngineTraitsMap

# Output files.
#
# ``languages_file`` is the module that write_languages_file() (re)creates; it
# is opened in 'w' mode there, so any previous content is replaced.
languages_file = Path(searx_dir) / 'sxng_locales.py'
# Text placed at the very start of the generated file.  It ends with the opening
# of the ``sxng_locales`` tuple literal; the pretty-printed items follow it.
languages_file_header = """\
# SPDX-License-Identifier: AGPL-3.0-or-later
'''List of SearXNG's locale codes.

.. hint::

   Don't modify this file, this file is generated by::

     ./manage data.traits
'''

sxng_locales = (
"""
# Text appended after the pretty-printed items: a trailing comma, the closing
# parenthesis of the tuple and a string literal documenting ``sxng_locales``.
# NOTE: this is a regular (non-raw) string, so the ``\U...`` escapes inside it
# are expanded by Python when this module is loaded.
languages_file_footer = """,
)
'''
A list of five-digit tuples:

0. SearXNG's internal locale tag (a language or region tag)
1. Name of the language (:py:obj:`babel.core.Locale.get_language_name`)
2. For region tags the name of the region (:py:obj:`babel.core.Locale.get_territory_name`).
   Empty string for language tags.
3. English language name (from :py:obj:`babel.core.Locale.english_name`)
4. Unicode flag (emoji) that fits to SearXNG's internal region tag. Languages
   are represented by a globe (\U0001F310)

.. code:: python

   ('en',    'English', '',              'English', '\U0001f310'),
   ('en-CA', 'English', 'Canada',        'English', '\U0001f1e8\U0001f1e6'),
   ('en-US', 'English', 'United States', 'English', '\U0001f1fa\U0001f1f8'),
   ..
   ('fr',    'Français', '',             'French',  '\U0001f310'),
   ('fr-BE', 'Français', 'Belgique',     'French',  '\U0001f1e7\U0001f1ea'),
   ('fr-CA', 'Français', 'Canada',       'French',  '\U0001f1e8\U0001f1e6'),

:meta hide-value:
'''
"""


# Flag overrides consulted by get_unicode_flag() before it falls back to the
# generic regional-indicator lookup.  A key is matched against
# ``locale.language`` first and, failing that, against the lower-cased
# ``locale.territory``.
lang2emoji = {
    'ha': '\U0001f1f3\U0001f1ea',  # Hausa --> flag of Niger (NE)
    'bs': '\U0001f1e7\U0001f1e6',  # Bosnian --> flag of Bosnia & Herzegovina (BA)
    'jp': '\U0001f1ef\U0001f1f5',  # Japanese --> flag JP
    'ua': '\U0001f1fa\U0001f1e6',  # Ukrainian --> flag UA
    'he': '\U0001f1ee\U0001f1f1',  # Hebrew --> flag IL
}


def main():
    """Refresh the engine traits and regenerate the locale file.

    Loads the engines listed in ``settings['engines']``, fetches their traits,
    filters the locale tags by the engine thresholds and writes the result to
    the generated locale module.
    """
    load_engines(settings['engines'])
    # traits_map = EngineTraitsMap.from_data()
    tags = filter_locales(fetch_traits_map())
    write_languages_file(tags)


def fetch_traits_map():
    """Fetch the traits of every engine and persist them.

    Sets a 10 second network timeout for the current thread, lets
    :py:obj:`EngineTraitsMap.fetch_traits` collect the traits (progress
    messages go to stdout), saves the result via ``save_data()`` and returns
    the traits map.
    """
    network.set_timeout_for_thread(10.0)

    def echo(msg):
        print(msg)

    fetched = EngineTraitsMap.fetch_traits(log=echo)
    print(f"fetched properties from {len(fetched)!s} engines")
    print(f"write json file: {fetched.ENGINE_TRAITS_FILE!s}")
    fetched.save_data()
    return fetched
def filter_locales(traits_map: "EngineTraitsMap", *, min_eng_per_region: int = 15, min_eng_per_lang: int = 20):
    """Filter language & region tags by a threshold.

    A region tag is kept when at least ``min_eng_per_region`` engines list it
    in their ``regions``; a language tag is kept when at least
    ``min_eng_per_lang`` engines list it in their ``languages``.  The language
    part of every kept region tag (the text before the first ``-``) is kept as
    well, even if the language itself does not reach its threshold.

    :param traits_map: mapping whose values expose ``regions`` and
        ``languages`` mappings keyed by tag.
    :param min_eng_per_region: minimum number of engines supporting a region.
    :param min_eng_per_lang: minimum number of engines supporting a language.
    :return: set of the selected region and language tags.
    """
    engines = list(traits_map.values())

    region_count = Counter(reg for eng in engines for reg in eng.regions)
    regions = {tag for tag, count in region_count.items() if count >= min_eng_per_region}
    lang_from_region = {tag.split('-')[0] for tag in regions}

    # ignore script types like zh_Hant, zh_Hans or sr_Latin, pa_Arab (they are
    # already counted by existence of 'zh' or 'sr', 'pa')
    lang_count = Counter(lang for eng in engines for lang in eng.languages if '_' not in lang)
    languages = {tag for tag, count in lang_count.items() if count >= min_eng_per_lang}

    return regions | lang_from_region | languages
def write_languages_file(sxng_tag_list):
    """Write the generated locale module to ``languages_file``.

    For every tag (processed in sorted order) one tuple is emitted:
    ``(tag, language name, territory name, english name, flag)``.  The tuples
    are pretty-printed between ``languages_file_header`` and
    ``languages_file_footer``.

    :param sxng_tag_list: iterable of locale tags using ``-`` as separator.
    """
    language_codes = []
    for sxng_tag in sorted(sxng_tag_list):
        sxng_locale: babel.Locale = babel.Locale.parse(sxng_tag, sep='-')
        # get_unicode_flag() returns None when no flag can be built
        flag = get_unicode_flag(sxng_locale) or ''

        item = (
            sxng_tag,
            sxng_locale.get_language_name().title(),  # type: ignore
            sxng_locale.get_territory_name() or '',
            sxng_locale.english_name.split(' (')[0] if sxng_locale.english_name else '',
            UnicodeEscape(flag),
        )
        language_codes.append(item)

    language_codes = tuple(language_codes)

    # Render the complete content before the file is opened for writing, so a
    # failure while formatting does not leave a truncated file behind.  The
    # [1:-1] slice drops the parentheses pformat() puts around the tuple; the
    # header and footer supply them.
    file_content = "{header} {language_codes}{footer}".format(
        header=languages_file_header,
        language_codes=pformat(language_codes, width=120, indent=4)[1:-1],
        footer=languages_file_footer,
    )
    # the context manager closes the file, no explicit close() is needed
    with languages_file.open('w', encoding='utf-8') as new_file:
        new_file.write(file_content)
class UnicodeEscape(str):
    """Escape unicode string in :py:obj:`pprint.pformat`

    The ``repr()`` of an instance is a single-quoted, ASCII-only Python string
    literal: non-ASCII characters are written as ``\\x``, ``\\u`` or ``\\U``
    escapes instead of the characters themselves.
    """

    def __repr__(self):
        # The 'unicode-escape' codec yields ASCII bytes and doubles
        # backslashes, but it leaves quotes untouched; escape single quotes so
        # the result is always a valid literal.
        escaped = self.encode('unicode-escape').decode('ascii')
        return "'" + escaped.replace("'", "\\'") + "'"
def get_unicode_flag(locale: babel.Locale):
    """Determine a unicode flag (emoji) that fits to the ``locale``

    Lookup order: an override in ``lang2emoji`` for the language, the globe
    for locales without a territory, an override in ``lang2emoji`` for the
    lower-cased territory, and finally the pair of regional indicator symbols
    built from the first two characters of the territory.  Returns ``None``
    (after printing an error) when such a symbol does not exist.
    """
    by_language = lang2emoji.get(locale.language)
    if by_language:
        return by_language

    territory = locale.territory
    if not territory:
        return '\U0001F310'

    # NOTE(review): language and territory codes share the lang2emoji key
    # space, e.g. territory 'BS' would pick the entry keyed 'bs' -- confirm
    # this is intended.
    by_territory = lang2emoji.get(territory.lower())
    if by_territory:
        return by_territory

    prefix = 'REGIONAL INDICATOR SYMBOL LETTER '
    try:
        first, second = lookup(prefix + territory[0]), lookup(prefix + territory[1])
    except KeyError as exc:
        # raised by unicodedata.lookup() for names that do not exist
        print("ERROR: %s --> %s" % (locale, exc))
        return None

    return first + second
# Run the update only when executed as a script, not on import.
if __name__ == "__main__":
    main()