epicyon/languages.py

312 lines
9.3 KiB
Python
Raw Normal View History

__filename__ = "languages.py"
__author__ = "Bob Mottram"
__license__ = "AGPL3+"
__version__ = "1.2.0"
__maintainer__ = "Bob Mottram"
2021-09-10 16:14:50 +00:00
__email__ = "bob@libreserver.org"
__status__ = "Production"
__module_group__ = "Core"
import os
2021-07-19 19:40:04 +00:00
import json
from urllib import request, parse
2021-12-26 10:22:19 +00:00
from utils import get_actor_languages_list
2021-12-27 15:43:22 +00:00
from utils import remove_html
2021-12-26 10:57:03 +00:00
from utils import has_object_dict
2021-12-26 14:08:58 +00:00
from utils import get_config_param
2021-12-26 10:19:59 +00:00
from utils import local_actor_url
from cache import getPersonFromCache
2021-12-26 10:29:52 +00:00
def getActorLanguages(actor_json: {}) -> str:
    """Returns the languages used by the given actor as a display string

    The entries reported by get_actor_languages_list() are joined
    with ' / '. An empty string is returned when nothing is listed.
    """
    codes = get_actor_languages_list(actor_json)
    if not codes:
        return ''
    joined = ''
    for code in codes:
        # the separator is only inserted once something has been collected
        joined = joined + ' / ' + code if joined else code
    return joined
2021-12-26 10:29:52 +00:00
def setActorLanguages(base_dir: str, actor_json: {},
                      languagesStr: str) -> None:
    """Sets the languages used by the given actor

    languagesStr is a list of language codes separated by one of
    '/', ',', ';', '+' or a space (the first of these which occurs in
    the string, checked in that order, is used as the separator).
    Codes are lowercased and stripped, and empty or repeated entries
    are dropped. When base_dir is given, a code is only accepted if
    base_dir/translations/<code>.json exists.

    actor_json is modified in place: the first attachment whose name
    starts with "languages" (case insensitive) is removed, and if any
    codes were accepted a new "Languages" PropertyValue holding them
    as a ', ' separated string is appended. A missing 'attachment'
    list is created when there is something to store.
    """
    separator = ','
    for candidate in ('/', ',', ';', '+', ' '):
        if candidate in languagesStr:
            separator = candidate
            break

    accepted = []
    for lang in languagesStr.lower().split(separator):
        lang = lang.strip()
        # doubled or trailing separators produce empty entries,
        # which must not end up in the stored value
        if not lang or lang in accepted:
            continue
        if base_dir:
            languageFilename = base_dir + '/translations/' + lang + '.json'
            if not os.path.isfile(languageFilename):
                continue
        accepted.append(lang)
    lang_list2 = ', '.join(accepted)

    # remove any existing value
    attachments = actor_json.get('attachment')
    if attachments:
        for property_value in attachments:
            if not isinstance(property_value, dict):
                continue
            if not property_value.get('name'):
                continue
            if not property_value.get('type'):
                continue
            if not property_value['name'].lower().startswith('languages'):
                continue
            # safe to remove during iteration because we stop immediately
            attachments.remove(property_value)
            break

    if not lang_list2:
        return

    if attachments is None:
        attachments = []
        actor_json['attachment'] = attachments
    newLanguages = {
        "name": "Languages",
        "type": "PropertyValue",
        "value": lang_list2
    }
    attachments.append(newLanguages)
2021-12-25 16:17:53 +00:00
def understoodPostLanguage(base_dir: str, nickname: str, domain: str,
                           message_json: {}, system_language: str,
                           http_prefix: str, domain_full: str,
                           person_cache: {}) -> bool:
    """Returns true if the post is written in a language
    understood by this account

    A post with no usable contentMap is treated as understood, as is
    one with content in the system language or in any language listed
    by the account's actor. Failing that, the post is understood if
    a configured LibreTranslate instance supports one of its languages.
    """
    if has_object_dict(message_json):
        post_obj = message_json['object']
    else:
        post_obj = message_json

    content_map = post_obj.get('contentMap')
    if not content_map or not isinstance(content_map, dict):
        return True
    if content_map.get(system_language):
        return True

    actor_url = local_actor_url(http_prefix, nickname, domain_full)
    actor_json = getPersonFromCache(base_dir, actor_url, person_cache, False)
    if not actor_json:
        print('WARN: unable to load actor to check languages ' + actor_url)
        return False

    understood = get_actor_languages_list(actor_json)
    if not understood:
        # an actor which lists no languages accepts everything
        return True
    if any(content_map.get(code) for code in understood):
        return True

    # is the language for this post supported by libretranslate?
    translator_url = get_config_param(base_dir, "libretranslateUrl")
    if not translator_url:
        return False
    translator_key = get_config_param(base_dir, "libretranslateApiKey")
    translatable = libretranslateLanguages(translator_url, translator_key)
    return any(content_map.get(code) for code in translatable)
2021-07-19 19:40:04 +00:00
2021-08-08 11:16:18 +00:00
def libretranslateLanguages(url: str, apiKey: str = None) -> []:
    """Returns a list of supported languages

    url is the address of a LibreTranslate instance; '/languages' is
    appended if it is not already there. apiKey, when given, is sent
    as the api_key form parameter.

    Returns the sorted two letter language codes reported by the
    instance. An empty list is returned if no url is given, if the
    request fails, or if the reply is not a JSON list, so that callers
    handling incoming posts are not broken by an unreachable instance.
    """
    # imported locally so that the module level imports are unchanged
    from http.client import HTTPException

    if not url:
        return []
    if not url.endswith('/languages'):
        if not url.endswith('/'):
            url += "/languages"
        else:
            url += "languages"

    params = {}
    if apiKey:
        params["api_key"] = apiKey
    urlParams = parse.urlencode(params)

    try:
        # supplying data (even when empty) makes urllib send a POST
        req = request.Request(url, data=urlParams.encode())
        with request.urlopen(req) as response:
            response_str = response.read().decode()
        result = json.loads(response_str)
    except (OSError, ValueError, HTTPException) as ex:
        # OSError covers URLError/HTTPError, ValueError covers a
        # malformed url, undecodable bytes and invalid JSON
        print('EX: libretranslateLanguages unable to get languages from ' +
              url + ' ' + str(ex))
        return []

    if not isinstance(result, list):
        return []

    lang_list = []
    for lang in result:
        if not isinstance(lang, dict):
            continue
        langCode = lang.get('code')
        if not langCode or not isinstance(langCode, str):
            continue
        # only two letter codes are accepted
        if len(langCode) != 2:
            continue
        lang_list.append(langCode)
    lang_list.sort()
    return lang_list
2021-07-19 19:40:04 +00:00
def getLinksFromContent(content: str) -> {}:
    """Returns a dict of the links within the given content

    Each '<a href="...">' whose address contains both '://' and '.'
    produces one entry mapping the anchor's text to its address.
    Markup nested inside the anchor (eg. <span>) is removed from the
    text, so a mention written as '@<span>name</span>' is keyed by
    '@name'. Each address is only recorded once. If the anchor text
    is empty, or is already used by a different address, the address
    itself is used as the key so that no link is lost.
    """
    # imported locally so that the module level imports are unchanged
    import re

    if '<a href' not in content:
        return {}

    links = {}
    seen_urls = set()
    # the text before the first '<a href' cannot contain a link
    for subsection in content.split('<a href')[1:]:
        if '"' not in subsection or '>' not in subsection:
            continue
        url = subsection.split('"')[1].strip()
        if '://' not in url or '.' not in url:
            continue
        if url in seen_urls:
            continue

        # everything between the end of the opening tag and </a>
        anchor_html = subsection.split('>', 1)[1]
        if '</a' in anchor_html:
            anchor_html = anchor_html.split('</a', 1)[0]
            linkText = re.sub(r'<[^>]*>', '', anchor_html)
        else:
            # unterminated anchor: keep only the text up to the next tag
            linkText = anchor_html.split('<')[0]

        if not linkText or linkText in links:
            linkText = url
        links[linkText] = url
        seen_urls.add(url)
    return links
2021-07-20 18:02:42 +00:00
def addLinksToContent(content: str, links: {}) -> str:
    """Puts links back into text from which they were removed

    links maps link text to address. A link whose text starts with
    '@' and occurs within the content has every occurrence of that
    text wrapped in an anchor. Any other link is appended as its own
    paragraph, described by at most the first 40 characters of its
    address.
    """
    anchor_attrs = '" rel="nofollow noopener noreferrer" target="_blank">'
    for label, href in links.items():
        opening_tag = '<a href="' + href + anchor_attrs
        is_inline_mention = label.startswith('@') and label in content
        if is_inline_mention:
            content = content.replace(label, opening_tag + label + '</a>')
            continue
        content += '<p>' + opening_tag + href[:40] + '</a></p>'
    return content
2021-08-08 11:16:18 +00:00
def libretranslate(url: str, text: str,
                   source: str, target: str, apiKey: str = None) -> str:
    """Translate string using libretranslate

    url is the address of a LibreTranslate instance; '/translate' is
    appended if it is not already there. text may contain html, which
    is removed before translation together with the addresses of any
    links; the links are added back to the translated text afterwards.
    source and target are the language codes to translate from and to,
    and apiKey, when given, is sent as the api_key form parameter.

    Returns the translation wrapped in a paragraph, the original text
    unchanged if the request or its reply could not be handled, or
    None if no url was given.
    """
    # imported locally so that the module level imports are unchanged
    from http.client import HTTPException

    if not url:
        return None
    if not url.endswith('/translate'):
        if not url.endswith('/'):
            url += "/translate"
        else:
            url += "translate"

    originalText = text

    # get any links from the text
    links = getLinksFromContent(text)

    # LibreTranslate doesn't like markup
    text = remove_html(text)

    # remove any links from plain text version of the content
    # The loop variable must not be named 'url': that would overwrite
    # the endpoint address, and the text and api key would then be
    # posted to the last linked site
    for link_url in links.values():
        text = text.replace(link_url, '')

    ltParams = {
        "q": text,
        "source": source,
        "target": target
    }
    if apiKey:
        ltParams["api_key"] = apiKey
    urlParams = parse.urlencode(ltParams)

    try:
        req = request.Request(url, data=urlParams.encode())
        with request.urlopen(req) as response:
            response_str = response.read().decode()
        translated = json.loads(response_str)['translatedText']
    except (OSError, ValueError, HTTPException, KeyError, TypeError):
        # OSError covers URLError/HTTPError, ValueError covers a
        # malformed url and invalid JSON, KeyError/TypeError cover a
        # reply without a translatedText field
        print('EX: Unable to translate: ' + text)
        return originalText
    if not isinstance(translated, str):
        print('EX: Unable to translate: ' + text)
        return originalText

    translatedText = '<p>' + translated + '</p>'

    # append links from the original text
    if links:
        translatedText = addLinksToContent(translatedText, links)
    return translatedText
2021-07-19 19:40:04 +00:00
2021-12-25 22:09:19 +00:00
def autoTranslatePost(base_dir: str, post_json_object: {},
                      system_language: str, translate: {}) -> str:
    """Tries to automatically translate the given post

    Returns an empty string when the post has no usable contentMap,
    when no LibreTranslate instance is configured, or when none of the
    languages it supports appears in the contentMap. Otherwise the
    first supported language found is translated into the system
    language. If the translation does not differ from the original
    (ignoring markup) the original content is returned, otherwise the
    translation is returned beneath an upper case 'Translated' banner.
    """
    if not has_object_dict(post_json_object):
        return ''
    content_map = post_json_object['object'].get('contentMap')
    if not content_map or not isinstance(content_map, dict):
        return ''

    # is the language for this post supported by libretranslate?
    translator_url = get_config_param(base_dir, "libretranslateUrl")
    if not translator_url:
        return ''
    translator_key = get_config_param(base_dir, "libretranslateApiKey")

    for code in libretranslateLanguages(translator_url, translator_key):
        content = content_map.get(code)
        if not content:
            continue
        # only the first language with content is ever attempted
        translated = libretranslate(translator_url, content,
                                    code, system_language,
                                    translator_key)
        if not translated:
            return translated
        if remove_html(translated) == remove_html(content):
            return content
        banner = '<p>' + translate['Translated'].upper() + '</p>'
        return banner + translated
    return ''