2021-07-19 08:46:21 +00:00
|
|
|
__filename__ = "languages.py"
|
|
|
|
__author__ = "Bob Mottram"
|
|
|
|
__license__ = "AGPL3+"
|
2022-02-03 13:58:20 +00:00
|
|
|
__version__ = "1.3.0"
|
2021-07-19 08:46:21 +00:00
|
|
|
__maintainer__ = "Bob Mottram"
|
2021-09-10 16:14:50 +00:00
|
|
|
__email__ = "bob@libreserver.org"
|
2021-07-19 08:46:21 +00:00
|
|
|
__status__ = "Production"
|
|
|
|
__module_group__ = "Core"
|
|
|
|
|
2021-07-19 19:40:04 +00:00
|
|
|
import json
|
|
|
|
from urllib import request, parse
|
2021-12-26 10:22:19 +00:00
|
|
|
from utils import get_actor_languages_list
|
2021-12-27 15:43:22 +00:00
|
|
|
from utils import remove_html
|
2021-12-26 10:57:03 +00:00
|
|
|
from utils import has_object_dict
|
2021-12-26 14:08:58 +00:00
|
|
|
from utils import get_config_param
|
2021-12-26 10:19:59 +00:00
|
|
|
from utils import local_actor_url
|
2021-12-29 21:55:09 +00:00
|
|
|
from cache import get_person_from_cache
|
2021-07-19 08:46:21 +00:00
|
|
|
|
|
|
|
|
2021-12-29 21:55:09 +00:00
|
|
|
def get_actor_languages(actor_json: {}) -> str:
    """Builds a single display string from the languages of the given
    actor, with ' / ' placed between consecutive languages.
    An empty string is returned if the actor has no languages
    """
    codes = get_actor_languages_list(actor_json)
    joined = ''
    if codes:
        for code in codes:
            # the separator is only added once something has been collected
            joined = joined + ' / ' + code if joined else code
    return joined
|
2021-07-19 08:46:21 +00:00
|
|
|
|
|
|
|
|
2022-01-28 11:29:01 +00:00
|
|
|
def get_understood_languages(base_dir: str, http_prefix: str,
                             nickname: str, domain_full: str,
                             person_cache: {}) -> []:
    """Looks up the actor for the given local account and returns the
    list of languages which it understands.
    If the actor cannot be loaded then a warning is printed and an
    empty list is returned
    """
    actor_url = local_actor_url(http_prefix, nickname, domain_full)
    actor = get_person_from_cache(base_dir, actor_url, person_cache)
    if actor:
        return get_actor_languages_list(actor)
    print('WARN: unable to load actor to obtain languages ' + actor_url)
    return []
|
|
|
|
|
|
|
|
|
2022-06-01 14:26:50 +00:00
|
|
|
def set_actor_languages(actor_json: {}, languages_str: str) -> None:
    """Sets the languages understood by the given actor

    languages_str contains languages separated by one of
    ',' '/' ';' '+' or a space. The first of those separators, in that
    priority order, which occurs within the string is used to split it.
    Each language is lowercased and stripped of surrounding whitespace.
    Empty entries are dropped and duplicates are removed, keeping the
    order in which languages first appear.

    The first existing attachment whose name starts with "languages"
    is removed from actor_json['attachment']. If any languages were
    given then a new "Languages" PropertyValue, containing them as a
    comma separated string, is appended. actor_json is modified in
    place and nothing is returned
    """
    languages_str = languages_str.strip().lower()
    separator = None
    for poss in (',', '/', ';', '+', ' '):
        if poss in languages_str:
            separator = poss
            break
    if separator:
        candidates = languages_str.split(separator)
    else:
        candidates = [languages_str]

    # Exact, order preserving de-duplication. Comparing whole entries
    # avoids substring false matches (eg. 'en' within 'eng'), and empty
    # entries produced by repeated or trailing separators are skipped
    lang_list = []
    for lang in candidates:
        lang = lang.strip()
        if lang and lang not in lang_list:
            lang_list.append(lang)
    lang_list2 = ', '.join(lang_list)

    # remove any existing value
    attachments = actor_json.get('attachment')
    if attachments:
        for property_value in attachments:
            if not isinstance(property_value, dict):
                continue
            name_value = property_value.get('name')
            if not name_value:
                name_value = property_value.get('schema:name')
            if not isinstance(name_value, str):
                continue
            if not property_value.get('type'):
                continue
            if not name_value.lower().startswith('languages'):
                continue
            # safe to remove while iterating because the loop ends here
            attachments.remove(property_value)
            break

    if not lang_list2:
        return

    new_languages = {
        "name": "Languages",
        "type": "PropertyValue",
        "value": lang_list2
    }
    if attachments is None:
        # the actor had no attachment list, so create one
        attachments = []
        actor_json['attachment'] = attachments
    attachments.append(new_languages)
|
2021-07-19 08:46:21 +00:00
|
|
|
|
|
|
|
|
2021-12-29 21:55:09 +00:00
|
|
|
def understood_post_language(base_dir: str, nickname: str, domain: str,
                             message_json: {}, system_language: str,
                             http_prefix: str, domain_full: str,
                             person_cache: {}) -> bool:
    """Returns true if the post is written in a language
    understood by this account

    A post without a contentMap dictionary is always treated as
    understood, as is one with content in the system language or in
    one of the languages of the account's actor, or where the actor
    lists no languages. Otherwise the post is understood only if a
    libretranslate url is configured and it supports one of the post
    languages. False is returned if the actor cannot be loaded.
    The domain parameter is not used within this function
    """
    if has_object_dict(message_json):
        post = message_json['object']
    else:
        post = message_json

    content_map = post.get('contentMap')
    if not content_map or not isinstance(content_map, dict):
        return True
    if content_map.get(system_language):
        return True

    actor_url = local_actor_url(http_prefix, nickname, domain_full)
    actor = get_person_from_cache(base_dir, actor_url, person_cache)
    if not actor:
        print('WARN: unable to load actor to check languages ' + actor_url)
        return False

    known_langs = get_actor_languages_list(actor)
    if not known_langs:
        return True
    if any(content_map.get(code) for code in known_langs):
        return True

    # is the language for this post supported by libretranslate?
    translate_url = get_config_param(base_dir, "libretranslateUrl")
    if not translate_url:
        return False
    translate_api_key = get_config_param(base_dir, "libretranslateApiKey")
    translatable = \
        libretranslate_languages(translate_url, translate_api_key)
    return any(content_map.get(code) for code in translatable)
|
2021-07-19 19:40:04 +00:00
|
|
|
|
|
|
|
|
2022-01-02 21:27:49 +00:00
|
|
|
def libretranslate_languages(url: str, api_key: str = None) -> []:
    """Returns a sorted list of the two character language codes
    supported by the libretranslate instance at the given url

    url is either the base url of the instance or its /languages
    endpoint. If api_key is given then it is sent as the api_key
    parameter of the request.
    An empty list is returned if no url is given, if the request
    fails, or if the response is not a JSON list
    """
    # imported here so that the module level imports are not relied upon
    from http.client import HTTPException

    if not url:
        return []
    if not url.endswith('/languages'):
        if not url.endswith('/'):
            url += "/languages"
        else:
            url += "languages"

    params = {}
    if api_key:
        params["api_key"] = api_key
    url_params = parse.urlencode(params)

    # Failing to obtain the languages should not raise into callers,
    # which matches the best-effort behavior of libretranslate().
    # Request() is inside the try because it raises ValueError for
    # a malformed url. Because data is supplied this is sent as a POST
    try:
        req = request.Request(url, data=url_params.encode())
        with request.urlopen(req) as response:
            response_str = response.read().decode()
        result = json.loads(response_str)
    except (OSError, ValueError, HTTPException) as ex:
        print('EX: Unable to get libretranslate languages ' +
              url + ' ' + str(ex))
        return []

    if not isinstance(result, list):
        return []

    lang_list = []
    for lang in result:
        if not isinstance(lang, dict):
            continue
        lang_code = lang.get('code')
        # codes which are not strings would otherwise break len()
        if not isinstance(lang_code, str):
            continue
        if len(lang_code) != 2:
            continue
        lang_list.append(lang_code)
    lang_list.sort()
    return lang_list
|
2021-07-19 19:40:04 +00:00
|
|
|
|
|
|
|
|
2021-12-29 21:55:09 +00:00
|
|
|
def get_links_from_content(content: str) -> {}:
    """Returns a dictionary of the links within the given html content,
    with the link text as the key and the url as the value

    Only anchors with a double quoted href containing both '://'
    and '.' are included. Each url is included once, under the link
    text of the first anchor in which it appears. If two different
    urls have the same link text then the later one replaces the
    earlier one
    """
    if '<a href' not in content:
        return {}

    links = {}
    # The dictionary is keyed by link text, so urls already collected
    # are tracked separately to avoid returning the same url twice
    seen_urls = set()
    # the text before the first anchor cannot contain a link
    for subsection in content.split('<a href')[1:]:
        if '"' not in subsection:
            continue
        url = subsection.split('"')[1].strip()
        if '://' not in url or '.' not in url:
            continue
        if '>' not in subsection:
            continue
        if url in seen_urls:
            continue
        seen_urls.add(url)
        # the link text is whatever follows the end of the opening tag,
        # up to the start of the next tag
        link_text = subsection.split('>')[1]
        if '<' in link_text:
            link_text = link_text.split('<')[0]
        links[link_text] = url
    return links
|
|
|
|
|
|
|
|
|
2021-12-29 21:55:09 +00:00
|
|
|
def add_links_to_content(content: str, links: {}) -> str:
    """Puts links back into the given content

    links maps link text to url. Where the link text begins with '@'
    and occurs within the content, every occurrence is replaced by an
    anchor to the url. Any other link is appended to the content as a
    paragraph, showing at most the first 40 characters of the url
    """
    anchor_attribs = '" rel="nofollow noopener noreferrer" target="_blank">'
    for link_text, url in links.items():
        if link_text.startswith('@') and link_text in content:
            mention = '<a href="' + url + anchor_attribs + link_text + '</a>'
            content = content.replace(link_text, mention)
            continue
        # slicing leaves shorter urls unchanged
        shown = url[:40]
        content += '<p><a href="' + url + anchor_attribs + shown + '</a></p>'
    return content
|
|
|
|
|
|
|
|
|
2021-08-08 11:16:18 +00:00
|
|
|
def libretranslate(url: str, text: str,
                   source: str, target: str, api_key: str = None) -> str:
    """Translate string using libretranslate

    url is either the base url of the libretranslate instance or its
    /translate endpoint. source and target are the languages to
    translate from and to. If api_key is given then it is sent as the
    api_key parameter of the request.

    Markup and link urls are removed from the text before it is sent.
    The translation is returned within a paragraph, with the links
    from the original text added back.
    None is returned if no url is given. The original text is returned
    unchanged if the request fails or the response is not usable
    """
    if not url:
        return None

    if not url.endswith('/translate'):
        if not url.endswith('/'):
            url += "/translate"
        else:
            url += "translate"

    original_text = text

    # get any links from the text
    links = get_links_from_content(text)

    # LibreTranslate doesn't like markup
    text = remove_html(text)

    # remove any links from plain text version of the content
    for url2 in links.values():
        text = text.replace(url2, '')

    lt_params = {
        "q": text,
        "source": source,
        "target": target
    }

    if api_key:
        lt_params["api_key"] = api_key

    url_params = parse.urlencode(lt_params)

    response_str = None
    # Translation is best-effort, so any ordinary failure falls back to
    # the original text. Exception rather than BaseException is caught
    # so that KeyboardInterrupt and SystemExit are not swallowed.
    # Request() is inside the try because it raises ValueError for
    # a malformed url
    try:
        req = request.Request(url, data=url_params.encode())
        with request.urlopen(req) as response:
            response_str = response.read().decode()
    except Exception as ex:
        print('EX: Unable to translate: ' + text + ' ' + str(ex))
        return original_text

    if not response_str:
        return original_text

    # the response may be invalid JSON or lack the translation field
    try:
        translated = json.loads(response_str)['translatedText']
    except (ValueError, KeyError, TypeError) as ex:
        print('EX: Unexpected libretranslate response ' + str(ex))
        return original_text
    if not isinstance(translated, str):
        return original_text

    translated_text = '<p>' + translated + '</p>'

    # append links from the original text
    if links:
        translated_text = add_links_to_content(translated_text, links)
    return translated_text
|
2021-07-19 19:40:04 +00:00
|
|
|
|
|
|
|
|
2021-12-29 21:55:09 +00:00
|
|
|
def auto_translate_post(base_dir: str, post_json_object: {},
                        system_language: str, translate: {}) -> str:
    """Tries to automatically translate the given post into the
    system language

    An empty string is returned if the post has no object or no
    contentMap dictionary, if no libretranslate url is configured, or
    if none of the languages supported by libretranslate has content
    within the post. The first supported language with content is the
    one which gets translated. If the translation has the same text as
    the original then the original content is returned, otherwise the
    translation is returned beneath an uppercase "Translated" heading
    """
    if not has_object_dict(post_json_object):
        return ''
    content_map = post_json_object['object'].get('contentMap')
    if not content_map or not isinstance(content_map, dict):
        return ''

    # is the language for this post supported by libretranslate?
    translate_url = get_config_param(base_dir, "libretranslateUrl")
    if not translate_url:
        return ''
    translate_api_key = get_config_param(base_dir, "libretranslateApiKey")

    supported = libretranslate_languages(translate_url, translate_api_key)
    for code in supported:
        original = content_map.get(code)
        if not original:
            continue
        result = libretranslate(translate_url, original,
                                code, system_language,
                                translate_api_key)
        if not result:
            # nothing was translated, so hand back the result as it is
            return result
        if remove_html(result) == remove_html(original):
            return original
        return '<p>' + translate['Translated'].upper() + '</p>' + result
    return ''
|