Handling of understood languages prior to automatic translation

main
Bob Mottram 2021-07-20 14:33:27 +01:00
parent bb3dee7533
commit bb3de9e173
14 changed files with 116 additions and 78 deletions

40
blog.py
View File

@ -16,6 +16,8 @@ from webapp_utils import htmlHeaderWithBlogMarkup
from webapp_utils import htmlFooter
from webapp_utils import getPostAttachmentsAsHtml
from webapp_media import addEmbeddedElements
from utils import getActorLanguagesList
from utils import getBaseContentFromPost
from utils import getContentFromPost
from utils import isAccountDir
from utils import removeHtml
@ -32,6 +34,7 @@ from utils import acctDir
from posts import createBlogsTimeline
from newswire import rss2Header
from newswire import rss2Footer
from cache import getPersonFromCache
def _noOfBlogReplies(baseDir: str, httpPrefix: str, translate: {},
@ -166,6 +169,7 @@ def _htmlBlogPostContent(authorized: bool,
handle: str, restrictToDomain: bool,
peertubeInstances: [],
systemLanguage: str,
personCache: {},
blogSeparator: str = '<hr>') -> str:
"""Returns the content for a single blog post
"""
@ -237,7 +241,15 @@ def _htmlBlogPostContent(authorized: bool,
if attachmentStr:
blogStr += '<br><center>' + attachmentStr + '</center>'
jsonContent = getContentFromPost(postJsonObject, systemLanguage)
personUrl = \
httpPrefix + '://' + domainFull + '/users/' + nickname
actorJson = \
getPersonFromCache(baseDir, personUrl, personCache, False)
languagesUnderstood = []
if actorJson:
languagesUnderstood = getActorLanguagesList(actorJson)
jsonContent = getContentFromPost(postJsonObject, systemLanguage,
languagesUnderstood)
if jsonContent:
contentStr = addEmbeddedElements(translate, jsonContent,
peertubeInstances)
@ -330,7 +342,8 @@ def _htmlBlogPostRSS2(authorized: bool,
pubDate = datetime.strptime(published, "%Y-%m-%dT%H:%M:%SZ")
titleStr = postJsonObject['object']['summary']
rssDateStr = pubDate.strftime("%a, %d %b %Y %H:%M:%S UT")
content = getContentFromPost(postJsonObject, systemLanguage)
content = \
getBaseContentFromPost(postJsonObject, systemLanguage)
description = firstParagraphFromString(content)
rssStr = ' <item>'
rssStr += ' <title>' + titleStr + '</title>'
@ -362,7 +375,8 @@ def _htmlBlogPostRSS3(authorized: bool,
pubDate = datetime.strptime(published, "%Y-%m-%dT%H:%M:%SZ")
titleStr = postJsonObject['object']['summary']
rssDateStr = pubDate.strftime("%a, %d %b %Y %H:%M:%S UT")
content = getContentFromPost(postJsonObject, systemLanguage)
content = \
getBaseContentFromPost(postJsonObject, systemLanguage)
description = firstParagraphFromString(content)
rssStr = 'title: ' + titleStr + '\n'
rssStr += 'link: ' + messageLink + '\n'
@ -386,7 +400,7 @@ def _htmlBlogRemoveCwButton(blogStr: str, translate: {}) -> str:
def _getSnippetFromBlogContent(postJsonObject: {}, systemLanguage: str) -> str:
"""Returns a snippet of text from the blog post as a preview
"""
content = getContentFromPost(postJsonObject, systemLanguage)
content = getBaseContentFromPost(postJsonObject, systemLanguage)
if '<p>' in content:
content = content.split('<p>', 1)[1]
if '</p>' in content:
@ -404,7 +418,7 @@ def htmlBlogPost(authorized: bool,
nickname: str, domain: str, domainFull: str,
postJsonObject: {},
peertubeInstances: [],
systemLanguage: str) -> str:
systemLanguage: str, personCache: {}) -> str:
"""Returns a html blog post
"""
blogStr = ''
@ -428,7 +442,8 @@ def htmlBlogPost(authorized: bool,
nickname, domain,
domainFull, postJsonObject,
None, False,
peertubeInstances, systemLanguage)
peertubeInstances, systemLanguage,
personCache)
# show rss links
blogStr += '<p class="rssfeed">'
@ -456,7 +471,8 @@ def htmlBlogPage(authorized: bool, session,
baseDir: str, httpPrefix: str, translate: {},
nickname: str, domain: str, port: int,
noOfItems: int, pageNumber: int,
peertubeInstances: [], systemLanguage: str) -> str:
peertubeInstances: [], systemLanguage: str,
personCache: {}) -> str:
"""Returns a html blog page containing posts
"""
if ' ' in nickname or '@' in nickname or \
@ -519,7 +535,8 @@ def htmlBlogPage(authorized: bool, session,
domainFull, item,
None, True,
peertubeInstances,
systemLanguage)
systemLanguage,
personCache)
if len(timelineJson['orderedItems']) >= noOfItems:
blogStr += navigateStr
@ -677,7 +694,8 @@ def htmlBlogView(authorized: bool,
session, baseDir: str, httpPrefix: str,
translate: {}, domain: str, port: int,
noOfItems: int,
peertubeInstances: [], systemLanguage: str) -> str:
peertubeInstances: [], systemLanguage: str,
personCache: {}) -> str:
"""Show the blog main page
"""
blogStr = ''
@ -696,7 +714,7 @@ def htmlBlogView(authorized: bool,
baseDir, httpPrefix, translate,
nickname, domain, port,
noOfItems, 1, peertubeInstances,
systemLanguage)
systemLanguage, personCache)
domainFull = getFullDomain(domain, port)
@ -840,7 +858,7 @@ def htmlEditBlog(mediaInstance: bool, translate: {},
placeholderMessage + '</label>'
messageBoxHeight = 800
contentStr = getContentFromPost(postJsonObject, systemLanguage)
contentStr = getBaseContentFromPost(postJsonObject, systemLanguage)
contentStr = contentStr.replace('<p>', '').replace('</p>', '\n')
editBlogForm += \

View File

@ -210,7 +210,7 @@ from shares import expireShares
from categories import setHashtagCategory
from languages import getActorLanguages
from languages import setActorLanguages
from utils import getContentFromPost
from utils import getBaseContentFromPost
from utils import acctDir
from utils import getImageExtensionFromMimeType
from utils import getImageMimeType
@ -9825,7 +9825,8 @@ class PubServer(BaseHTTPRequestHandler):
domain, port,
maxPostsInBlogsFeed, pageNumber,
self.server.peertubeInstances,
self.server.systemLanguage)
self.server.systemLanguage,
self.server.personCache)
if msg is not None:
msg = msg.encode('utf-8')
msglen = len(msg)
@ -10955,7 +10956,8 @@ class PubServer(BaseHTTPRequestHandler):
self.server.port,
maxPostsInBlogsFeed,
self.server.peertubeInstances,
self.server.systemLanguage)
self.server.systemLanguage,
self.server.personCache)
if msg is not None:
msg = msg.encode('utf-8')
msglen = len(msg)
@ -11054,7 +11056,8 @@ class PubServer(BaseHTTPRequestHandler):
self.server.domainFull,
postJsonObject,
self.server.peertubeInstances,
self.server.systemLanguage)
self.server.systemLanguage,
self.server.personCache)
if msg is not None:
msg = msg.encode('utf-8')
msglen = len(msg)
@ -13186,8 +13189,8 @@ class PubServer(BaseHTTPRequestHandler):
return 1
if pinToProfile:
contentStr = \
getContentFromPost(messageJson,
self.server.systemLanguage)
getBaseContentFromPost(messageJson,
self.server.systemLanguage)
pinPost(self.server.baseDir,
nickname, self.server.domain, contentStr)
return 1

View File

@ -16,7 +16,7 @@ import webbrowser
import urllib.parse
from pathlib import Path
from random import randint
from utils import getContentFromPost
from utils import getBaseContentFromPost
from utils import hasObjectDict
from utils import getFullDomain
from utils import isDM
@ -700,7 +700,7 @@ def _readLocalBoxPost(session, nickname: str, domain: str,
postJsonObject2['object'].get('content'):
attributedTo = postJsonObject2['object']['attributedTo']
content = \
getContentFromPost(postJsonObject2, systemLanguage)
getBaseContentFromPost(postJsonObject2, systemLanguage)
if isinstance(attributedTo, str) and content:
actor = attributedTo
nameStr += ' ' + translate['announces'] + ' ' + \
@ -725,7 +725,7 @@ def _readLocalBoxPost(session, nickname: str, domain: str,
attributedTo = postJsonObject['object']['attributedTo']
if not attributedTo:
return {}
content = getContentFromPost(postJsonObject, systemLanguage)
content = getBaseContentFromPost(postJsonObject, systemLanguage)
if not isinstance(attributedTo, str) or \
not isinstance(content, str):
return {}
@ -1048,7 +1048,7 @@ def _desktopShowBox(indent: str,
published = _formatPublished(postJsonObject['published'])
contentStr = getContentFromPost(postJsonObject, systemLanguage)
contentStr = getBaseContentFromPost(postJsonObject, systemLanguage)
content = _textOnlyContent(contentStr)
if boxName != 'dm':
if isDM(postJsonObject):
@ -2334,7 +2334,7 @@ def runDesktopClient(baseDir: str, proxyType: str, httpPrefix: str,
postJsonObject = postJsonObject2
if postJsonObject:
content = \
getContentFromPost(postJsonObject, systemLanguage)
getBaseContentFromPost(postJsonObject, systemLanguage)
messageStr, detectedLinks = \
speakableText(baseDir, content, translate)
linkOpened = False
@ -2390,8 +2390,8 @@ def runDesktopClient(baseDir: str, proxyType: str, httpPrefix: str,
print('')
if postJsonObject['object'].get('summary'):
print(postJsonObject['object']['summary'])
contentStr = getContentFromPost(postJsonObject,
systemLanguage)
contentStr = getBaseContentFromPost(postJsonObject,
systemLanguage)
print(contentStr)
print('')
sayStr = 'Confirm delete, yes or no?'

View File

@ -14,7 +14,7 @@ import time
import random
from linked_data_sig import verifyJsonSignature
from languages import understoodPostLanguage
from utils import getContentFromPost
from utils import getBaseContentFromPost
from utils import acctDir
from utils import removeDomainPort
from utils import getPortFromDomain
@ -353,7 +353,7 @@ def savePostToInboxQueue(baseDir: str, httpPrefix: str,
httpHeaders: {},
postPath: str, debug: bool,
blockedCache: [], systemLanguage: str) -> str:
"""Saves the give json to the inbox queue for the person
"""Saves the given json to the inbox queue for the person
keyId specifies the actor sending the post
"""
if len(messageBytes) > 10240:
@ -416,7 +416,7 @@ def savePostToInboxQueue(baseDir: str, httpPrefix: str,
replyNickname + '@' + replyDomain)
return None
if postJsonObject['object'].get('content'):
contentStr = getContentFromPost(postJsonObject, systemLanguage)
contentStr = getBaseContentFromPost(postJsonObject, systemLanguage)
if contentStr:
if isFiltered(baseDir, nickname, domain, contentStr):
if debug:
@ -1649,7 +1649,7 @@ def _validPostContent(baseDir: str, nickname: str, domain: str,
messageJson['object']['content']):
return True
contentStr = getContentFromPost(messageJson, systemLanguage)
contentStr = getBaseContentFromPost(messageJson, systemLanguage)
if dangerousMarkup(contentStr, allowLocalNetworkAccess):
if messageJson['object'].get('id'):
print('REJECT ARBITRARY HTML: ' + messageJson['object']['id'])
@ -1951,7 +1951,7 @@ def _sendToGroupMembers(session, baseDir: str, handle: str, port: int,
sendingActorDomainFull = \
getFullDomain(sendingActorDomain, sendingActorPort)
senderStr = '@' + sendingActorNickname + '@' + sendingActorDomainFull
contentStr = getContentFromPost(postJsonObject, systemLanguage)
contentStr = getBaseContentFromPost(postJsonObject, systemLanguage)
if not contentStr.startswith(senderStr):
postJsonObject['object']['content'] = \
senderStr + ' ' + contentStr

View File

@ -10,41 +10,17 @@ __module_group__ = "Core"
import os
import json
from urllib import request, parse
from utils import getActorLanguagesList
from utils import removeHtml
from utils import acctDir
from utils import hasObjectDict
from utils import getConfigParam
from cache import getPersonFromCache
def _getActorLanguagesList(actorJson: {}) -> []:
"""Returns a list containing languages used by the given actor
"""
if not actorJson.get('attachment'):
return []
for propertyValue in actorJson['attachment']:
if not propertyValue.get('name'):
continue
if not propertyValue['name'].lower().startswith('languages'):
continue
if not propertyValue.get('type'):
continue
if not propertyValue.get('value'):
continue
if not isinstance(propertyValue['value'], list):
continue
if propertyValue['type'] != 'PropertyValue':
continue
langList = propertyValue['value']
langList.sort()
return langList
return []
def getActorLanguages(actorJson: {}) -> str:
"""Returns a string containing languages used by the given actor
"""
langList = _getActorLanguagesList(actorJson)
langList = getActorLanguagesList(actorJson)
if not langList:
return ''
languagesStr = ''
@ -121,7 +97,7 @@ def understoodPostLanguage(baseDir: str, nickname: str, domain: str,
if not actorJson:
print('WARN: unable to load actor to check languages ' + personUrl)
return False
languagesUnderstood = _getActorLanguagesList(actorJson)
languagesUnderstood = getActorLanguagesList(actorJson)
if not languagesUnderstood:
return True
for lang in languagesUnderstood:

View File

@ -13,7 +13,7 @@ import subprocess
from random import randint
from hashlib import sha1
from auth import createPassword
from utils import getContentFromPost
from utils import getBaseContentFromPost
from utils import getFullDomain
from utils import getImageExtensions
from utils import getVideoExtensions
@ -38,12 +38,13 @@ def replaceYouTube(postJsonObject: {}, replacementDomain: str,
return
if not postJsonObject['object'].get('content'):
return
contentStr = getContentFromPost(postJsonObject, systemLanguage)
contentStr = getBaseContentFromPost(postJsonObject, systemLanguage)
if 'www.youtube.com' not in contentStr:
return
contentStr = contentStr.replace('www.youtube.com', replacementDomain)
postJsonObject['object']['content'] = contentStr
postJsonObject['object']['contentMap'][systemLanguage] = contentStr
if postJsonObject['object'].get('contentMap'):
postJsonObject['object']['contentMap'][systemLanguage] = contentStr
def _removeMetaData(imageFilename: str, outputFilename: str) -> None:

View File

@ -25,7 +25,7 @@ from newswire import getDictFromNewswire
from posts import createNewsPost
from posts import archivePostsForPerson
from content import validHashTag
from utils import getContentFromPost
from utils import getBaseContentFromPost
from utils import removeHtml
from utils import getFullDomain
from utils import loadJson
@ -314,7 +314,7 @@ def _hashtagAdd(baseDir: str, httpPrefix: str, domainFull: str,
hashtagHtml = \
" <a href=\"" + hashtagUrl + "\" class=\"addedHashtag\" " + \
"rel=\"tag\">#<span>" + htId + "</span></a>"
content = getContentFromPost(postJsonObject, systemLanguage)
content = getBaseContentFromPost(postJsonObject, systemLanguage)
if hashtagHtml in content:
return
@ -344,7 +344,7 @@ def _hashtagRemove(httpPrefix: str, domainFull: str, postJsonObject: {},
hashtagHtml = \
"<a href=\"" + hashtagUrl + "\" class=\"addedHashtag\" " + \
"rel=\"tag\">#<span>" + htId + "</span></a>"
content = getContentFromPost(postJsonObject, systemLanguage)
content = getBaseContentFromPost(postJsonObject, systemLanguage)
if hashtagHtml in content:
content = content.replace(hashtagHtml, '').replace(' ', ' ')
postJsonObject['object']['content'] = content
@ -385,7 +385,7 @@ def _newswireHashtagProcessing(session, baseDir: str, postJsonObject: {},
# get the full text content of the post
content = ''
if postJsonObject['object'].get('content'):
content += getContentFromPost(postJsonObject, systemLanguage)
content += getBaseContentFromPost(postJsonObject, systemLanguage)
if postJsonObject['object'].get('summary'):
content += ' ' + postJsonObject['object']['summary']
content = content.lower()
@ -667,7 +667,7 @@ def _convertRSStoActivityPub(baseDir: str, httpPrefix: str,
"\" class=\"addedHashtag\" " + \
"rel=\"tag\">#<span>" + \
htId + "</span></a>"
content = getContentFromPost(blog, systemLanguage)
content = getBaseContentFromPost(blog, systemLanguage)
if hashtagHtml not in content:
if content.endswith('</p>'):
content = \

View File

@ -18,7 +18,7 @@ from datetime import timezone
from collections import OrderedDict
from utils import validPostDate
from categories import setHashtagCategory
from utils import getContentFromPost
from utils import getBaseContentFromPost
from utils import hasObjectDict
from utils import firstParagraphFromString
from utils import isPublicPost
@ -963,7 +963,7 @@ def _addAccountBlogsToNewswire(baseDir: str, nickname: str, domain: str,
if os.path.isfile(fullPostFilename + '.votes'):
votes = loadJson(fullPostFilename + '.votes')
content = \
getContentFromPost(postJsonObject, systemLanguage)
getBaseContentFromPost(postJsonObject, systemLanguage)
description = firstParagraphFromString(content)
description = removeHtml(description)
tagsFromPost = _getHashtagsFromPost(postJsonObject)

View File

@ -16,7 +16,7 @@ from posts import outboxMessageCreateWrap
from posts import savePostToBox
from posts import sendToFollowersThread
from posts import sendToNamedAddresses
from utils import getContentFromPost
from utils import getBaseContentFromPost
from utils import hasObjectDict
from utils import getLocalNetworkAddresses
from utils import getFullDomain
@ -213,7 +213,7 @@ def postMessageToOutbox(session, translate: {},
# check that the outgoing post doesn't contain any markup
# which can be used to implement exploits
if hasObjectDict(messageJson):
contentStr = getContentFromPost(messageJson, systemLanguage)
contentStr = getBaseContentFromPost(messageJson, systemLanguage)
if contentStr:
if dangerousMarkup(contentStr, allowLocalNetworkAccess):
print('POST to outbox contains dangerous markup: ' +

View File

@ -32,7 +32,7 @@ from webfinger import webfingerHandle
from httpsig import createSignedHeader
from siteactive import siteIsActive
from languages import understoodPostLanguage
from utils import getContentFromPost
from utils import getBaseContentFromPost
from utils import removeDomainPort
from utils import getPortFromDomain
from utils import hasObjectDict
@ -387,7 +387,7 @@ def _getPosts(session, outboxUrl: str, maxPosts: int,
if not isPublic:
continue
content = getContentFromPost(item, systemLanguage)
content = getBaseContentFromPost(item, systemLanguage)
content = content.replace('&apos;', "'")
mentions = []
@ -565,7 +565,7 @@ def getPostDomains(session, outboxUrl: str, maxPosts: int,
break
if not hasObjectDict(item):
continue
contentStr = getContentFromPost(item, systemLanguage)
contentStr = getBaseContentFromPost(item, systemLanguage)
if contentStr:
_updateWordFrequency(contentStr, wordFrequency)
if item['object'].get('inReplyTo'):

View File

@ -28,7 +28,32 @@ invalidCharacters = (
)
def getContentFromPost(postJsonObject: {}, systemLanguage: str) -> str:
def getActorLanguagesList(actorJson: {}) -> []:
    """Returns a list containing languages used by the given actor

    The languages are taken from the first entry within the actor's
    'attachment' list which has type 'PropertyValue', a name beginning
    with 'languages' (case insensitive) and a non-empty list as its value.
    A sorted copy of that list is returned, so the actor json itself is
    not modified. An empty list is returned if there is no such entry.
    """
    attachments = actorJson.get('attachment')
    if not attachments:
        return []
    for propertyValue in attachments:
        # skip malformed entries rather than raising an exception
        if not isinstance(propertyValue, dict):
            continue
        name = propertyValue.get('name')
        if not isinstance(name, str):
            continue
        if not name.lower().startswith('languages'):
            continue
        if propertyValue.get('type') != 'PropertyValue':
            continue
        value = propertyValue.get('value')
        if not value or not isinstance(value, list):
            continue
        # sorted() rather than list.sort() so that the caller's actor
        # json is not reordered as a side effect of reading it
        return sorted(value)
    return []
def getContentFromPost(postJsonObject: {}, systemLanguage: str,
languagesUnderstood: []) -> str:
"""Returns the content from the post in the given language
including searching for a matching entry within contentMap
"""
@ -43,6 +68,12 @@ def getContentFromPost(postJsonObject: {}, systemLanguage: str) -> str:
if thisPostJson['contentMap'].get(systemLanguage):
if isinstance(thisPostJson['contentMap'][systemLanguage], str):
return thisPostJson['contentMap'][systemLanguage]
else:
# is there a contentMap entry for one of
# the understood languages?
for lang in languagesUnderstood:
if thisPostJson['contentMap'].get(lang):
return thisPostJson['contentMap'][lang]
else:
if isinstance(thisPostJson['content'], str):
content = thisPostJson['content']

View File

@ -11,7 +11,7 @@ import os
from datetime import datetime
from content import removeLongWords
from content import limitRepeatedWords
from utils import getContentFromPost
from utils import getBaseContentFromPost
from utils import removeHtml
from utils import locatePost
from utils import loadJson
@ -698,7 +698,7 @@ def htmlEditNewsPost(cssCache: {}, translate: {}, baseDir: str, path: str,
' <input type="text" name="newsPostTitle" value="' + \
newsPostTitle + '"><br>\n'
newsPostContent = getContentFromPost(postJsonObject, systemLanguage)
newsPostContent = getBaseContentFromPost(postJsonObject, systemLanguage)
editNewsPostForm += \
' <textarea id="message" name="editedNewsPost" ' + \
'style="height:600px" spellcheck="true">' + \

View File

@ -22,8 +22,8 @@ from posts import postIsMuted
from posts import getPersonBox
from posts import downloadAnnounce
from posts import populateRepliesJson
from utils import getActorLanguagesList
from utils import getBaseContentFromPost
from utils import getContentFromPost
from utils import hasObjectDict
from utils import updateAnnounceCollection
from utils import isPGPEncrypted
@ -1592,7 +1592,16 @@ def individualPostAsHtml(allowDownloads: bool,
postJsonObject['object']['contentMap'][systemLanguage] = \
postJsonObject['object']['content']
contentStr = getContentFromPost(postJsonObject, systemLanguage)
domainFull = getFullDomain(domain, port)
personUrl = \
httpPrefix + '://' + domainFull + '/users/' + nickname
actorJson = \
getPersonFromCache(baseDir, personUrl, personCache, False)
languagesUnderstood = []
if actorJson:
languagesUnderstood = getActorLanguagesList(actorJson)
contentStr = getBaseContentFromPost(postJsonObject, systemLanguage,
languagesUnderstood)
if not contentStr:
contentStr = \
autoTranslatePost(baseDir, postJsonObject,

View File

@ -11,7 +11,7 @@ import os
from shutil import copyfile
import urllib.parse
from datetime import datetime
from utils import getContentFromPost
from utils import getBaseContentFromPost
from utils import isAccountDir
from utils import getConfigParam
from utils import getFullDomain
@ -904,7 +904,7 @@ def rssHashtagSearch(nickname: str, domain: str, port: int,
postJsonObject['object']['summary'] + \
'</title>'
description = \
getContentFromPost(postJsonObject, systemLanguage)
getBaseContentFromPost(postJsonObject, systemLanguage)
description = firstParagraphFromString(description)
hashtagFeed += \
' <description>' + description + '</description>'