epicyon/languages.py

329 lines
10 KiB
Python
Raw Normal View History

__filename__ = "languages.py"
__author__ = "Bob Mottram"
__license__ = "AGPL3+"
2022-02-03 13:58:20 +00:00
__version__ = "1.3.0"
__maintainer__ = "Bob Mottram"
2021-09-10 16:14:50 +00:00
__email__ = "bob@libreserver.org"
__status__ = "Production"
__module_group__ = "Core"
2021-07-19 19:40:04 +00:00
import json
from urllib import request, parse
2021-12-26 10:22:19 +00:00
from utils import get_actor_languages_list
2021-12-27 15:43:22 +00:00
from utils import remove_html
2021-12-26 10:57:03 +00:00
from utils import has_object_dict
2021-12-26 14:08:58 +00:00
from utils import get_config_param
2021-12-26 10:19:59 +00:00
from utils import local_actor_url
2021-12-29 21:55:09 +00:00
from cache import get_person_from_cache
2021-12-29 21:55:09 +00:00
def get_actor_languages(actor_json: {}) -> str:
    """Returns the languages used by the given actor as a single string,
    with the languages separated by ' / '
    An empty string is returned if the actor has no languages
    """
    known_languages = get_actor_languages_list(actor_json)
    if not known_languages:
        return ''
    joined = ''
    for code in known_languages:
        # a separator is only needed once something has been added
        joined = joined + ' / ' + code if joined else code
    return joined
def get_understood_languages(base_dir: str, http_prefix: str,
                             nickname: str, domain_full: str,
                             person_cache: {}) -> []:
    """Returns a list of understood languages for the given account
    The actor for the account is looked up via get_person_from_cache.
    If it can't be obtained a warning is printed and an empty list
    is returned
    """
    actor_url = local_actor_url(http_prefix, nickname, domain_full)
    actor_json = get_person_from_cache(base_dir, actor_url,
                                       person_cache, False)
    if actor_json:
        return get_actor_languages_list(actor_json)
    print('WARN: unable to load actor to obtain languages ' + actor_url)
    return []
def set_actor_languages(actor_json: {}, languages_str: str) -> None:
    """Sets the languages understood by the given actor

    languages_str contains languages separated by one of , / ; + or
    space. The first of those separators found within the string
    (tested in that order) is the one used to split it.
    Languages are lowercased and stripped of surrounding whitespace.
    Empty entries and duplicates are dropped, keeping the original order.

    Any existing attachment whose name begins with "languages" is
    removed from the actor. If at least one language was given then a
    new "Languages" PropertyValue is appended, whose value is the
    languages joined by ', '. A missing attachment list is created
    when needed.
    """
    languages_str = languages_str.strip().lower()
    separator = None
    for poss in (',', '/', ';', '+', ' '):
        if poss in languages_str:
            separator = poss
            break
    if separator:
        candidates = languages_str.split(separator)
    else:
        candidates = [languages_str]

    # Compare whole entries rather than substrings of the joined text,
    # so that a repeat of the first language is detected and
    # "en" is not mistaken for a duplicate of "eng"
    lang_list = []
    for lang in candidates:
        lang = lang.strip()
        if lang and lang not in lang_list:
            lang_list.append(lang)

    attachments = actor_json.get('attachment')
    if attachments is None:
        # only written back to the actor if there is something to add
        attachments = []

    # remove any existing value
    property_found = None
    for property_value in attachments:
        if not isinstance(property_value, dict):
            continue
        name_value = \
            property_value.get('name') or property_value.get('schema:name')
        if not name_value or not isinstance(name_value, str):
            continue
        if not property_value.get('type'):
            continue
        if not name_value.lower().startswith('languages'):
            continue
        property_found = property_value
        break
    if property_found:
        attachments.remove(property_found)

    if not lang_list:
        return

    attachments.append({
        "name": "Languages",
        "type": "PropertyValue",
        "value": ', '.join(lang_list)
    })
    actor_json['attachment'] = attachments
2021-12-29 21:55:09 +00:00
def understood_post_language(base_dir: str, nickname: str, domain: str,
                             message_json: {}, system_language: str,
                             http_prefix: str, domain_full: str,
                             person_cache: {}) -> bool:
    """Returns true if the post is written in a language
    understood by this account

    A post without a contentMap dict is treated as understood, as is
    one having content in the system language, in any of the languages
    of the account's actor or in any language which the configured
    libretranslate instance supports.
    The domain parameter is not used within this function.
    """
    if has_object_dict(message_json):
        post_obj = message_json['object']
    else:
        post_obj = message_json

    content_map = post_obj.get('contentMap')
    if not content_map or not isinstance(content_map, dict):
        return True
    if content_map.get(system_language):
        return True

    actor_url = local_actor_url(http_prefix, nickname, domain_full)
    actor_json = get_person_from_cache(base_dir, actor_url,
                                       person_cache, False)
    if not actor_json:
        print('WARN: unable to load actor to check languages ' + actor_url)
        return False

    understood = get_actor_languages_list(actor_json)
    if not understood:
        # no stated languages, so nothing is filtered out
        return True
    if any(content_map.get(lang) for lang in understood):
        return True

    # is the language for this post supported by libretranslate?
    translate_url = get_config_param(base_dir, "libretranslateUrl")
    if not translate_url:
        return False
    translate_api_key = get_config_param(base_dir, "libretranslateApiKey")
    translatable = libretranslate_languages(translate_url, translate_api_key)
    return any(content_map.get(lang) for lang in translatable)
2021-07-19 19:40:04 +00:00
2022-01-02 21:27:49 +00:00
def libretranslate_languages(url: str, api_key: str = None) -> []:
    """Returns a sorted list of the two letter language codes
    supported by the libretranslate instance at the given url

    url may be given with or without a trailing /languages.
    If api_key is given then it is sent as the api_key parameter.
    An empty list is returned if there is no url, if the request
    fails or if the reply is not a json list.
    """
    if not url:
        return []
    if not url.endswith('/languages'):
        if not url.endswith('/'):
            url += "/languages"
        else:
            url += "languages"

    params = {}
    if api_key:
        params["api_key"] = api_key
    url_params = parse.urlencode(params)

    # Obtaining the languages is best effort, in the same way as
    # translation itself. An unreachable server, a malformed url or
    # a reply which isn't json should not take down the caller
    try:
        req = request.Request(url, data=url_params.encode())
        with request.urlopen(req) as response:
            response_str = response.read().decode()
        result = json.loads(response_str)
    except Exception as ex:
        print('EX: Unable to get libretranslate languages: ' + str(ex))
        return []

    if not isinstance(result, list):
        return []

    lang_list = []
    for lang in result:
        if not isinstance(lang, dict):
            continue
        lang_code = lang.get('code')
        # only two letter codes are accepted
        if not isinstance(lang_code, str) or len(lang_code) != 2:
            continue
        lang_list.append(lang_code)
    lang_list.sort()
    return lang_list
2021-07-19 19:40:04 +00:00
2021-12-29 21:55:09 +00:00
def get_links_from_content(content: str) -> {}:
    """Returns a dict of the links within the given html content,
    mapping the text of each link to its url

    Only urls containing both '://' and '.' are included.
    Each url appears at most once, under the link text of its
    first occurrence. The link text is whatever follows the first '>'
    of the anchor, up to the next '<'.
    """
    if '<a href' not in content:
        return {}

    links = {}
    # The dict is keyed by link text, so urls which were already added
    # are tracked separately to avoid the same url appearing twice
    seen_urls = set()
    # the first section is whatever precedes the first link
    for subsection in content.split('<a href')[1:]:
        if '"' not in subsection or '>' not in subsection:
            continue
        url = subsection.split('"')[1].strip()
        if '://' not in url or '.' not in url:
            continue
        if url in seen_urls:
            continue
        seen_urls.add(url)
        link_text = subsection.split('>')[1].split('<')[0]
        links[link_text] = url
    return links
2021-12-29 21:55:09 +00:00
def add_links_to_content(content: str, links: {}) -> str:
    """Adds links back into plain text

    links maps link text to url. Where the link text begins with '@'
    and occurs within the content, every occurrence is replaced by an
    anchor for the url. Any other link is appended to the content as
    its own paragraph, described by the first 40 characters of the url.
    """
    anchor_attribs = '" rel="nofollow noopener noreferrer" target="_blank">'
    for link_text in links:
        url = links[link_text]
        if link_text.startswith('@') and link_text in content:
            anchor = '<a href="' + url + anchor_attribs + link_text + '</a>'
            content = content.replace(link_text, anchor)
            continue
        # keep the visible description of the link short
        url_desc = url if len(url) <= 40 else url[:40]
        content += \
            '<p><a href="' + url + anchor_attribs + url_desc + '</a></p>'
    return content
2021-08-08 11:16:18 +00:00
def libretranslate(url: str, text: str,
                   source: str, target: str, api_key: str = None) -> str:
    """Translate string using libretranslate

    url may be given with or without a trailing /translate.
    source and target are the language parameters sent with the request
    and api_key, if given, is sent as the api_key parameter.
    Markup is removed from the text before it is sent, and any links
    found within the original text are added back to the translation.

    Returns the translation wrapped within a paragraph, the original
    text if the translation could not be obtained, or None if no url
    was given.
    """
    if not url:
        return None
    if not url.endswith('/translate'):
        if not url.endswith('/'):
            url += "/translate"
        else:
            url += "translate"
    original_text = text

    # get any links from the text
    links = get_links_from_content(text)
    # LibreTranslate doesn't like markup
    text = remove_html(text)
    # remove any links from plain text version of the content
    for url2 in links.values():
        text = text.replace(url2, '')

    lt_params = {
        "q": text,
        "source": source,
        "target": target
    }
    if api_key:
        lt_params["api_key"] = api_key
    url_params = parse.urlencode(lt_params)

    # Translation is best effort, so any failure to get a reply falls
    # back to the original text. Exception rather than BaseException is
    # caught so that KeyboardInterrupt and SystemExit still propagate
    response_str = None
    try:
        req = request.Request(url, data=url_params.encode())
        with request.urlopen(req) as response:
            response_str = response.read().decode()
    except Exception as ex:
        print('EX: Unable to translate: ' + text + ' ' + str(ex))
        return original_text

    if not response_str:
        return original_text

    # the reply might be an error message rather than a translation
    try:
        translated = json.loads(response_str)['translatedText']
    except (ValueError, KeyError, TypeError) as ex:
        print('EX: Unexpected libretranslate response: ' + str(ex))
        return original_text
    if not isinstance(translated, str):
        return original_text

    translated_text = '<p>' + translated + '</p>'
    # append links from the original text
    if links:
        translated_text = add_links_to_content(translated_text, links)
    return translated_text
2021-07-19 19:40:04 +00:00
2021-12-29 21:55:09 +00:00
def auto_translate_post(base_dir: str, post_json_object: {},
                        system_language: str, translate: {}) -> str:
    """Tries to automatically translate the given post

    The languages supported by the configured libretranslate instance
    are tried in turn, and the first one for which the post has content
    and a translation is obtained decides the result. That is either
    the translation preceded by an uppercase "Translated" heading, or
    the untranslated content if the translation reads the same as it.
    An empty string is returned if nothing could be translated.
    """
    if not has_object_dict(post_json_object):
        return ''
    content_map = post_json_object['object'].get('contentMap')
    if not content_map or not isinstance(content_map, dict):
        return ''

    # is the language for this post supported by libretranslate?
    translate_url = get_config_param(base_dir, "libretranslateUrl")
    if not translate_url:
        return ''
    translate_api_key = get_config_param(base_dir, "libretranslateApiKey")

    for lang in libretranslate_languages(translate_url, translate_api_key):
        content = content_map.get(lang)
        if not content:
            continue
        translated_text = \
            libretranslate(translate_url, content,
                           lang, system_language,
                           translate_api_key)
        if not translated_text:
            continue
        if remove_html(translated_text) == remove_html(content):
            # nothing changed, so don't label it as a translation
            return content
        heading = '<p>' + translate['Translated'].upper() + '</p>'
        return heading + translated_text
    return ''