2021-07-19 08:46:21 +00:00
|
|
|
__filename__ = "languages.py"
|
|
|
|
__author__ = "Bob Mottram"
|
|
|
|
__license__ = "AGPL3+"
|
|
|
|
__version__ = "1.2.0"
|
|
|
|
__maintainer__ = "Bob Mottram"
|
2021-09-10 16:14:50 +00:00
|
|
|
__email__ = "bob@libreserver.org"
|
2021-07-19 08:46:21 +00:00
|
|
|
__status__ = "Production"
|
|
|
|
__module_group__ = "Core"
|
|
|
|
|
|
|
|
import os
|
2021-07-19 19:40:04 +00:00
|
|
|
import json
|
|
|
|
from urllib import request, parse
|
2021-07-20 13:33:27 +00:00
|
|
|
from utils import getActorLanguagesList
|
2021-07-20 10:13:22 +00:00
|
|
|
from utils import removeHtml
|
2021-07-19 19:40:04 +00:00
|
|
|
from utils import hasObjectDict
|
|
|
|
from utils import getConfigParam
|
2021-08-14 11:13:39 +00:00
|
|
|
from utils import localActorUrl
|
2021-07-19 08:46:21 +00:00
|
|
|
from cache import getPersonFromCache
|
|
|
|
|
|
|
|
|
|
|
|
def getActorLanguages(actorJson: {}) -> str:
    """Returns the languages used by the given actor as a single string,
    with the entries separated by ' / '
    An empty string is returned if the actor has no languages
    """
    codes = getActorLanguagesList(actorJson)
    if not codes:
        return ''
    joined = ''
    for code in codes:
        # the separator is only added once something has been accumulated
        joined = joined + ' / ' + code if joined else code
    return joined
|
|
|
|
|
|
|
|
|
2021-12-25 16:17:53 +00:00
|
|
|
def setActorLanguages(base_dir: str, actorJson: {}, languagesStr: str) -> None:
    """Sets the languages used by the given actor

    languagesStr is split on the first separator which it contains,
    checked in the order '/', ',', ';', '+' and space. Each entry is
    lowercased and stripped, and empty entries are ignored.
    If base_dir is given then only entries having a
    base_dir/translations/<entry>.json file are kept, otherwise
    every entry is kept.

    Any existing attachment whose name begins with "languages" is removed
    from actorJson['attachment']. If any languages remain then they are
    appended as a new "Languages" PropertyValue whose value is a
    comma separated string. The attachment list is created if the actor
    does not have one.
    """
    separator = ','
    for candidate in ('/', ',', ';', '+', ' '):
        if candidate in languagesStr:
            separator = candidate
            break

    langCodes = []
    for lang in languagesStr.lower().split(separator):
        lang = lang.strip()
        if not lang:
            # a trailing or doubled separator produces empty entries,
            # which would otherwise leave a dangling ', ' in the value
            continue
        if base_dir:
            languageFilename = base_dir + '/translations/' + lang + '.json'
            if not os.path.isfile(languageFilename):
                continue
        langCodes.append(lang)
    langList2 = ', '.join(langCodes)

    attachments = actorJson.get('attachment')
    if attachments is None:
        # tolerate actors which have no attachment list yet
        attachments = []

    # remove any existing value
    propertyFound = None
    for propertyValue in attachments:
        if not isinstance(propertyValue, dict):
            continue
        if not propertyValue.get('name'):
            continue
        if not propertyValue.get('type'):
            continue
        if not isinstance(propertyValue['name'], str):
            continue
        if not propertyValue['name'].lower().startswith('languages'):
            continue
        propertyFound = propertyValue
        break
    if propertyFound:
        attachments.remove(propertyFound)

    if not langList2:
        return

    newLanguages = {
        "name": "Languages",
        "type": "PropertyValue",
        "value": langList2
    }
    attachments.append(newLanguages)
    # only has an effect when the list was created above
    actorJson['attachment'] = attachments
|
|
|
|
|
|
|
|
|
2021-12-25 16:17:53 +00:00
|
|
|
def understoodPostLanguage(base_dir: str, nickname: str, domain: str,
                           messageJson: {}, system_language: str,
                           http_prefix: str, domainFull: str,
                           person_cache: {}) -> bool:
    """Returns true if the post is written in a language
    understood by this account

    A post is treated as understood if it has no contentMap dictionary,
    if its contentMap has an entry for the system language or for any of
    the languages of the account's actor, or if the actor lists no
    languages. Otherwise it is understood only if a libretranslateUrl is
    configured and libretranslate supports one of the contentMap languages.
    False is returned if the actor cannot be loaded from the cache.
    The domain parameter is not used here.
    """
    if hasObjectDict(messageJson):
        msgObject = messageJson['object']
    else:
        msgObject = messageJson

    # without a usable language map the post is assumed to be understood
    contentMap = msgObject.get('contentMap')
    if not contentMap or not isinstance(contentMap, dict):
        return True
    if contentMap.get(system_language):
        return True

    personUrl = localActorUrl(http_prefix, nickname, domainFull)
    actorJson = getPersonFromCache(base_dir, personUrl, person_cache, False)
    if not actorJson:
        print('WARN: unable to load actor to check languages ' + personUrl)
        return False

    languagesUnderstood = getActorLanguagesList(actorJson)
    if not languagesUnderstood:
        return True
    if any(contentMap.get(lang) for lang in languagesUnderstood):
        return True

    # is the language for this post supported by libretranslate?
    libretranslateUrl = getConfigParam(base_dir, "libretranslateUrl")
    if not libretranslateUrl:
        return False
    libretranslateApiKey = getConfigParam(base_dir, "libretranslateApiKey")
    translatable = \
        libretranslateLanguages(libretranslateUrl, libretranslateApiKey)
    return any(contentMap.get(lang) for lang in translatable)
|
2021-07-19 19:40:04 +00:00
|
|
|
|
|
|
|
|
2021-08-08 11:16:18 +00:00
|
|
|
def libretranslateLanguages(url: str, apiKey: str = None) -> []:
    """Returns a sorted list of the two letter language codes
    supported by the libretranslate instance at the given url

    The request is sent to the "/languages" endpoint, which is appended
    to url if it is not already there. If apiKey is given then it is
    sent as the "api_key" parameter.
    An empty list is returned if url is empty, if the request fails or
    if the response is not a JSON list.
    """
    # urllib does not wrap every http.client failure in URLError
    from http.client import HTTPException

    if not url:
        return []
    if not url.endswith('/languages'):
        if not url.endswith('/'):
            url += "/languages"
        else:
            url += "languages"

    params = {}
    if apiKey:
        params["api_key"] = apiKey
    urlParams = parse.urlencode(params)

    try:
        # Request raises ValueError for a malformed url
        req = request.Request(url, data=urlParams.encode())
        # the context manager closes the response once it has been read
        with request.urlopen(req) as response:
            result = json.loads(response.read().decode())
    except (OSError, ValueError, HTTPException) as ex:
        # an unreachable or misbehaving server is treated as
        # supporting no languages rather than raising into the caller
        print('EX: Unable to get libretranslate languages: ' + str(ex))
        return []

    if not isinstance(result, list):
        return []

    langList = []
    for lang in result:
        if not isinstance(lang, dict):
            continue
        langCode = lang.get('code')
        # non-string codes are skipped, since len() would fail on them
        if not langCode or not isinstance(langCode, str):
            continue
        if len(langCode) != 2:
            continue
        langList.append(langCode)
    langList.sort()
    return langList
|
|
|
|
|
|
|
|
|
2021-07-20 17:49:12 +00:00
|
|
|
def getLinksFromContent(content: str) -> {}:
    """Returns a dictionary of the links within the given content,
    where each key is the text of a link and each value is its url

    Only links written as <a href="..."> with a double quoted url
    containing '://' and '.' are returned. A url which has already been
    collected is not added again under a different link text.
    """
    if '<a href' not in content:
        return {}
    links = {}
    # the first section is whatever comes before the first link
    for subsection in content.split('<a href')[1:]:
        if '"' not in subsection or '>' not in subsection:
            continue
        url = subsection.split('"')[1].strip()
        if '://' not in url or '.' not in url:
            continue
        # the dictionary is keyed by link text, so duplicate urls
        # have to be looked for within its values
        if url in links.values():
            continue
        linkText = subsection.split('>')[1]
        if '<' in linkText:
            linkText = linkText.split('<')[0]
        links[linkText] = url
    return links
|
|
|
|
|
|
|
|
|
2021-07-20 18:02:42 +00:00
|
|
|
def addLinksToContent(content: str, links: {}) -> str:
    """Adds links back into plain text

    links is a dictionary of link text to url.
    If the link text begins with '@' and occurs within the content then
    every occurrence is replaced by a html link having that text.
    Any other link is appended to the content as a new paragraph, whose
    text is the url shortened to at most 40 characters.
    """
    anchorAttribs = '" rel="nofollow noopener noreferrer" target="_blank">'
    for linkText, url in links.items():
        if linkText.startswith('@') and linkText in content:
            # turn the mention back into a link at its original position
            anchor = '<a href="' + url + anchorAttribs + linkText + '</a>'
            content = content.replace(linkText, anchor)
            continue
        urlDesc = url if len(url) <= 40 else url[:40]
        content += '<p><a href="' + url + anchorAttribs + urlDesc + '</a></p>'
    return content
|
|
|
|
|
|
|
|
|
2021-08-08 11:16:18 +00:00
|
|
|
def libretranslate(url: str, text: str,
                   source: str, target: str, apiKey: str = None) -> str:
    """Translate string using libretranslate

    url is the address of the libretranslate instance, to which
    "/translate" is appended if it is not already there.
    text is the html content to be translated from the source
    language into the target language.
    If apiKey is given then it is sent as the "api_key" parameter.

    Returns None if no url is given. Returns the original text if the
    request fails or if the response contains no translated text.
    Otherwise returns the translation within a paragraph, with any
    links from the original text added back.
    """
    # urllib does not wrap every http.client failure in URLError
    from http.client import HTTPException

    if not url:
        return None

    if not url.endswith('/translate'):
        if not url.endswith('/'):
            url += "/translate"
        else:
            url += "translate"

    originalText = text

    # get any links from the text
    links = getLinksFromContent(text)

    # LibreTranslate doesn't like markup
    text = removeHtml(text)

    # remove any links from plain text version of the content
    # NOTE: the loop variable must not be named url, otherwise the
    # request below, including any api key, would be sent to the last
    # link within the text rather than to the translation endpoint
    for linkUrl in links.values():
        text = text.replace(linkUrl, '')

    ltParams = {
        "q": text,
        "source": source,
        "target": target
    }

    if apiKey:
        ltParams["api_key"] = apiKey

    urlParams = parse.urlencode(ltParams)

    try:
        # Request raises ValueError for a malformed url
        req = request.Request(url, data=urlParams.encode())
        # the context manager closes the response once it has been read
        with request.urlopen(req) as response:
            response_str = response.read().decode()
        result = json.loads(response_str)
    except (OSError, ValueError, HTTPException):
        print('EX: Unable to translate: ' + text)
        return originalText

    translated = None
    if isinstance(result, dict):
        translated = result.get('translatedText')
    if not isinstance(translated, str):
        # for example an error response from the server
        print('EX: Unable to translate: ' + text)
        return originalText

    translatedText = '<p>' + translated + '</p>'

    # append links from the original text
    if links:
        translatedText = addLinksToContent(translatedText, links)
    return translatedText
|
2021-07-19 19:40:04 +00:00
|
|
|
|
|
|
|
|
2021-12-25 22:09:19 +00:00
|
|
|
def autoTranslatePost(base_dir: str, post_json_object: {},
                      system_language: str, translate: {}) -> str:
    """Tries to automatically translate the given post

    The first contentMap language supported by libretranslate which
    yields a translation is used. If the translation is the same as the
    original, once html has been removed from both, then the original
    content is returned. Otherwise the translation is returned, preceded
    by a paragraph containing translate['Translated'] in upper case.
    An empty string is returned if the post has no contentMap dictionary,
    if libretranslateUrl is not configured or if nothing was translated.
    """
    if not hasObjectDict(post_json_object):
        return ''
    contentMap = post_json_object['object'].get('contentMap')
    if not contentMap or not isinstance(contentMap, dict):
        return ''

    # is the language for this post supported by libretranslate?
    libretranslateUrl = getConfigParam(base_dir, "libretranslateUrl")
    if not libretranslateUrl:
        return ''
    libretranslateApiKey = getConfigParam(base_dir, "libretranslateApiKey")
    supported = \
        libretranslateLanguages(libretranslateUrl, libretranslateApiKey)
    for lang in supported:
        content = contentMap.get(lang)
        if not content:
            continue
        translatedText = \
            libretranslate(libretranslateUrl, content,
                           lang, system_language,
                           libretranslateApiKey)
        if not translatedText:
            continue
        # an unchanged result means that no translation took place
        if removeHtml(translatedText) == removeHtml(content):
            return content
        return '<p>' + translate['Translated'].upper() + '</p>' + \
            translatedText
    return ''
|