epicyon/languages.py

405 lines
13 KiB
Python
Raw Normal View History

__filename__ = "languages.py"
__author__ = "Bob Mottram"
__license__ = "AGPL3+"
2024-12-22 23:37:30 +00:00
__version__ = "1.6.0"
__maintainer__ = "Bob Mottram"
2021-09-10 16:14:50 +00:00
__email__ = "bob@libreserver.org"
__status__ = "Production"
__module_group__ = "Core"
2021-07-19 19:40:04 +00:00
import json
2022-12-08 15:28:17 +00:00
import os
2021-07-19 19:40:04 +00:00
from urllib import request, parse
2024-05-12 12:35:26 +00:00
from utils import data_dir
2022-12-08 15:28:17 +00:00
from utils import is_account_dir
from utils import acct_dir
2021-12-26 10:22:19 +00:00
from utils import get_actor_languages_list
2021-12-27 15:43:22 +00:00
from utils import remove_html
2021-12-26 10:57:03 +00:00
from utils import has_object_dict
2021-12-26 14:08:58 +00:00
from utils import get_config_param
2021-12-26 10:19:59 +00:00
from utils import local_actor_url
2024-01-27 17:04:21 +00:00
from utils import resembles_url
2021-12-29 21:55:09 +00:00
from cache import get_person_from_cache
2021-12-29 21:55:09 +00:00
def get_actor_languages(actor_json: {}) -> str:
    """Returns the languages used by the given actor as a single string,
    with the individual languages separated by ' / '.
    An empty string is returned if the actor has no languages
    """
    actor_langs = get_actor_languages_list(actor_json)
    if not actor_langs:
        return ''
    joined = ''
    for code in actor_langs:
        # the separator is only added once something has been collected
        joined = joined + ' / ' + code if joined else code
    return joined
def get_understood_languages(base_dir: str, http_prefix: str,
                             nickname: str, domain_full: str,
                             person_cache: {}) -> []:
    """Returns the list of languages understood by the given local account.
    The actor is looked up via get_person_from_cache, and an empty list
    is returned (with a warning printed) if it could not be obtained
    """
    actor_url = local_actor_url(http_prefix, nickname, domain_full)
    actor = get_person_from_cache(base_dir, actor_url, person_cache)
    if actor:
        return get_actor_languages_list(actor)
    print('WARN: unable to load actor to obtain languages ' + actor_url)
    return []
def set_actor_languages(actor_json: {}, languages_str: str) -> None:
    """Sets the languages understood by the given actor

    languages_str is lowercased and split on the first of the separators
    ',' '/' ';' '+' or space which it contains. Blank entries and
    duplicates are dropped, keeping the first occurrence of each language.

    The first existing attachment having a type and a name (or schema:name)
    beginning with 'languages' is removed from actor_json['attachment'],
    then a new "Languages" PropertyValue whose value is the
    comma separated list is appended. If no languages remain then
    the existing property is only removed.
    """
    languages_str = languages_str.strip().lower()

    # the first separator found within the string is the one used
    separator = None
    for poss in (',', '/', ';', '+', ' '):
        if poss in languages_str:
            separator = poss
            break
    if separator:
        candidates = languages_str.split(separator)
    else:
        candidates = [languages_str]

    # compare whole language codes, not substrings of the joined text,
    # so that "en" is neither confused with "eng" nor added twice
    lang_list: list[str] = []
    for lang in candidates:
        lang = lang.strip()
        if lang and lang not in lang_list:
            lang_list.append(lang)
    lang_list2 = ', '.join(lang_list)

    # an actor without any attachments is treated as having an empty list
    attachments = actor_json.get('attachment')
    if attachments is None:
        attachments = []

    # remove any existing value
    property_found = None
    for property_value in attachments:
        if not isinstance(property_value, dict):
            continue
        name_value = None
        if property_value.get('name'):
            name_value = property_value['name']
        elif property_value.get('schema:name'):
            name_value = property_value['schema:name']
        if not name_value:
            continue
        if not property_value.get('type'):
            continue
        if not name_value.lower().startswith('languages'):
            continue
        property_found = property_value
        break
    if property_found:
        attachments.remove(property_found)

    if not lang_list2:
        return

    new_languages = {
        "name": "Languages",
        "type": "PropertyValue",
        "value": lang_list2
    }
    attachments.append(new_languages)
    # stores the list when the actor previously had no attachment field
    actor_json['attachment'] = attachments
2022-06-14 10:51:40 +00:00
def understood_post_language(base_dir: str, nickname: str,
                             message_json: {}, system_language: str,
                             http_prefix: str, domain_full: str,
                             person_cache: {}) -> bool:
    """Returns true if the post is written in a language
    understood by this account.

    Posts without a contentMap dict are always accepted, as are posts
    having content in the system language. Otherwise the contentMap is
    checked against the languages of the account's actor and then
    against the languages reported by libretranslate, if a
    libretranslateUrl has been configured
    """
    msg_object = message_json
    if has_object_dict(message_json):
        msg_object = message_json['object']

    content_map = msg_object.get('contentMap')
    if not content_map or not isinstance(content_map, dict):
        return True
    if content_map.get(system_language):
        return True

    actor_url = local_actor_url(http_prefix, nickname, domain_full)
    actor = get_person_from_cache(base_dir, actor_url, person_cache)
    if not actor:
        print('WARN: unable to load actor to check languages ' + actor_url)
        return False

    understood = get_actor_languages_list(actor)
    if not understood:
        # an actor which declares no languages accepts everything
        return True
    if any(content_map.get(code) for code in understood):
        return True

    # is the language for this post supported by libretranslate?
    translate_url = get_config_param(base_dir, "libretranslateUrl")
    if not translate_url:
        return False
    translate_api_key = get_config_param(base_dir, "libretranslateApiKey")
    supported = libretranslate_languages(translate_url, translate_api_key)
    return any(content_map.get(code) for code in supported)
2021-07-19 19:40:04 +00:00
2024-02-19 15:38:08 +00:00
def libretranslate_languages(url: str, api_key: str) -> []:
    """Returns a sorted list of the two letter language codes
    supported by the libretranslate server at the given url

    url is the base url of the server, with '/languages' being appended
    if it is not already present. api_key is sent as the api_key
    parameter if it is set.

    An empty list is returned if there is no url, if the server could
    not be contacted or if its reply was not a json list
    """
    if not url:
        return []
    if not url.endswith('/languages'):
        if not url.endswith('/'):
            url += "/languages"
        else:
            url += "languages"

    params = {}
    if api_key:
        params["api_key"] = api_key
    url_params = parse.urlencode(params)

    response_str = ''
    try:
        req = request.Request(url, data=url_params.encode())
        with request.urlopen(req) as response:
            response_str = response.read().decode()
    except Exception as ex:
        # best effort: an unreachable server or a malformed url just
        # means that no translation languages are available
        print('EX: Unable to get libretranslate languages: ' +
              str(url) + ' ' + str(ex))
        return []

    try:
        result = json.loads(response_str)
    except json.decoder.JSONDecodeError as ex:
        print('EX: json decode error ' + str(ex) +
              ' from libretranslate_languages ' +
              str(response_str))
        return []

    if not result:
        return []
    if not isinstance(result, list):
        return []

    lang_list: list[str] = []
    for lang in result:
        if not isinstance(lang, dict):
            continue
        if not lang.get('code'):
            continue
        lang_code = lang['code']
        # only two letter language codes are used
        if not isinstance(lang_code, str) or len(lang_code) != 2:
            continue
        lang_list.append(lang_code)
    lang_list.sort()
    return lang_list
2021-07-19 19:40:04 +00:00
2021-12-29 21:55:09 +00:00
def get_links_from_content(content: str) -> {}:
    """Returns a dictionary of the links within the given content,
    with the link text as the key and the url as the value
    """
    if '<a href' not in content:
        return {}
    found = {}
    # the text before the first anchor contains no link, so skip it
    for fragment in content.split('<a href')[1:]:
        if '"' not in fragment:
            continue
        # the url is the first double quoted value after <a href
        url = fragment.split('"')[1].strip()
        if not resembles_url(url):
            continue
        if '>' not in fragment:
            continue
        # NOTE(review): this compares the url against the link text keys
        # rather than against the urls already stored - confirm intent
        if url in found:
            continue
        # the link text runs from the end of the opening tag
        # up to the beginning of the next tag
        link_text = fragment.split('>')[1]
        if '<' in link_text:
            link_text = link_text.split('<')[0]
        found[link_text] = url
    return found
2021-12-29 21:55:09 +00:00
def add_links_to_content(content: str, links: {}) -> str:
    """Adds links back into plain text.
    links maps link text to url. Link text beginning with '@' which
    appears within the content is replaced by an anchor. Any other
    link is appended as a paragraph containing an anchor, described
    by the first 40 characters of its url
    """
    anchor_attributes = \
        '" rel="nofollow noopener noreferrer" target="_blank">'
    for link_text, url in links.items():
        if link_text.startswith('@') and link_text in content:
            # mention: turn every occurrence into a link
            anchor = \
                '<a href="' + url + anchor_attributes + link_text + '</a>'
            content = content.replace(link_text, anchor)
            continue
        # long urls are truncated within the visible description
        url_desc = url[:40]
        content += \
            '<p><a href="' + url + anchor_attributes + url_desc + '</a></p>'
    return content
2021-08-08 11:16:18 +00:00
def libretranslate(url: str, text: str,
                   source: str, target: str, api_key: str) -> str:
    """Translate string using libretranslate

    url is the base url of the libretranslate server, with '/translate'
    being appended if it is not already present. text is translated from
    the source language to the target language, and api_key is sent as
    the api_key parameter if it is set.

    Markup is removed from the text before it is sent, and any links
    which it contained are added back into the translation.

    Returns None if there is no url. Returns the original text if the
    request fails or if the reply contains no translation. Otherwise
    returns the translated text within a paragraph
    """
    if not url:
        return None
    if not url.endswith('/translate'):
        if not url.endswith('/'):
            url += "/translate"
        else:
            url += "translate"
    original_text = text

    # get any links from the text
    links = get_links_from_content(text)

    # LibreTranslate doesn't like markup
    text = remove_html(text)

    # remove any links from plain text version of the content
    for url2 in links.values():
        text = text.replace(url2, '')

    lt_params = {
        "q": text,
        "source": source,
        "target": target
    }
    if api_key:
        lt_params["api_key"] = api_key
    url_params = parse.urlencode(lt_params)

    response_str = None
    try:
        # creating the request can itself fail if the url is malformed
        req = request.Request(url, data=url_params.encode())
        with request.urlopen(req) as response:
            response_str = response.read().decode()
    except Exception as ex:
        # best effort: fall back to the untranslated text, without
        # swallowing KeyboardInterrupt or SystemExit
        print('EX: Unable to translate: ' + text + ' ' + str(ex))
        return original_text

    if not response_str:
        return original_text

    try:
        result = json.loads(response_str)
    except json.decoder.JSONDecodeError as ex:
        print('EX: json decode error ' + str(ex) +
              ' from libretranslate ' +
              str(response_str))
        return original_text

    # a valid json reply need not contain a translation,
    # for example if the server replied with an error
    translated = None
    if isinstance(result, dict):
        translated = result.get('translatedText')
    if not isinstance(translated, str):
        print('EX: no translatedText from libretranslate ' +
              str(response_str))
        return original_text
    translated_text = '<p>' + translated + '</p>'

    # append links from the original text
    if links:
        translated_text = add_links_to_content(translated_text, links)
    return translated_text
2021-07-19 19:40:04 +00:00
2021-12-29 21:55:09 +00:00
def auto_translate_post(base_dir: str, post_json_object: {},
                        system_language: str, translate: {}) -> str:
    """Tries to automatically translate the given post into the system
    language using libretranslate.

    An empty string is returned if the post has no contentMap dict,
    if no libretranslateUrl is configured or if the contentMap has no
    content in any language supported by libretranslate. If the
    translation does not differ from the content, other than by its
    markup, then the content itself is returned. Otherwise the
    translation is returned, preceded by an upper case
    translate['Translated'] heading
    """
    if not has_object_dict(post_json_object):
        return ''
    content_map = post_json_object['object'].get('contentMap')
    if not content_map or not isinstance(content_map, dict):
        return ''

    # is the language for this post supported by libretranslate?
    server_url = get_config_param(base_dir, "libretranslateUrl")
    if not server_url:
        return ''
    server_api_key = get_config_param(base_dir, "libretranslateApiKey")

    for lang in libretranslate_languages(server_url, server_api_key):
        content = content_map.get(lang)
        if not content:
            continue
        translated_text = \
            libretranslate(server_url, content,
                           lang, system_language,
                           server_api_key)
        if not translated_text:
            # an empty result is handed back unchanged
            return translated_text
        if remove_html(translated_text) == remove_html(content):
            # nothing was changed by the translation
            return content
        heading = '<p>' + translate['Translated'].upper() + '</p>'
        return heading + translated_text
    return ''
2022-12-08 15:28:17 +00:00
def set_default_post_language(base_dir: str, nickname: str, domain: str,
                              language: str) -> None:
    """Sets the default language for new posts made by the given account.
    The language is saved to the .new_post_language file within the
    account directory, and a message is printed if it cannot be written
    """
    account_dir = acct_dir(base_dir, nickname, domain)
    language_filename = account_dir + '/.new_post_language'
    try:
        with open(language_filename, 'w+', encoding='utf-8') as fp_lang:
            fp_lang.write(language)
    except OSError:
        print('EX: Unable to write default post language ' +
              language_filename)
def load_default_post_languages(base_dir: str) -> {}:
    """Returns a dictionary containing the default languages
    for new posts for each account.
    The key is the account nickname and the value is the content of the
    .new_post_language file within the account directory. Accounts
    without that file do not appear within the dictionary
    """
    languages = {}
    # only the top level of the data directory is examined
    top_level = next(os.walk(data_dir(base_dir)), None)
    if top_level is None:
        return languages
    for handle in top_level[1]:
        if not is_account_dir(handle):
            continue
        handle_sections = handle.split('@')
        nickname = handle_sections[0]
        domain = handle_sections[1]
        language_filename = \
            acct_dir(base_dir, nickname, domain) + '/.new_post_language'
        if not os.path.isfile(language_filename):
            continue
        try:
            with open(language_filename, 'r',
                      encoding='utf-8') as fp_lang:
                languages[nickname] = fp_lang.read()
        except OSError:
            print('EX: Unable to read default post language ' +
                  language_filename)
    return languages
def get_reply_language(base_dir: str,
                       post_json_object: {}) -> str:
    """Returns the language that the given post was written in

    This is the first key of the post's contentMap for which a
    translations/<language>.json file exists beneath base_dir.
    None is returned if the post has no contentMap dict or if
    none of its languages has a translations file
    """
    post_obj = post_json_object
    if has_object_dict(post_json_object):
        post_obj = post_json_object['object']
    content_map = post_obj.get('contentMap')
    if not content_map:
        return None
    if not isinstance(content_map, dict):
        return None
    for lang in content_map:
        # the language codes come from the post itself, so ignore anything
        # which would refer to a file outside of the translations directory
        if not isinstance(lang, str) or os.path.basename(lang) != lang:
            continue
        lang_filename = base_dir + '/translations/' + lang + '.json'
        if os.path.isfile(lang_filename):
            return lang
    return None