Check for edited posts

main
Bob Mottram 2021-10-14 16:12:35 +01:00
parent 054f4bde8e
commit 67bc0d61f4
5 changed files with 201 additions and 7 deletions

View File

@ -24,6 +24,7 @@ from utils import containsPGPPublicKey
from utils import acctDir
from utils import isfloat
from utils import getCurrencies
from utils import removeHtml
from petnames import getPetName
@ -1133,3 +1134,42 @@ def getPriceFromString(priceStr: str) -> (str, str):
if isfloat(priceStr):
return priceStr, "EUR"
return "0.00", "EUR"
def wordsSimilarity(content1: str, content2: str, minWords: int) -> int:
    """Returns percentage similarity between two pieces of content

    Both texts have their html removed (via removeHtml), are lowercased
    and are split on single spaces. The adjacent word pairs of each text
    are then counted and the counts for content1 are compared against
    the counts for content2.

    content1, content2: the texts to be compared
    minWords: if either text contains fewer words than this then
              zero is returned

    Returns 100 if the texts are identical, otherwise a value in the
    range 0 to 100
    """
    from collections import Counter

    if content1 == content2:
        return 100

    words1 = removeHtml(content1).lower().split(' ')
    if len(words1) < minWords:
        return 0
    words2 = removeHtml(content2).lower().split(' ')
    if len(words2) < minWords:
        return 0

    # Word pairs are kept as tuples rather than concatenated strings,
    # so that ("ab", "c") and ("a", "bc") remain distinct
    histogram1 = Counter(zip(words1, words1[1:]))
    histogram2 = Counter(zip(words2, words2[1:]))

    if not histogram1:
        # A single word has no pairs to compare, and dividing by the
        # number of pairs would otherwise fail
        return 100 if words1 == words2 else 0

    diff = 0
    for wordPair, hits in histogram1.items():
        otherHits = histogram2.get(wordPair, 0)
        if not otherHits:
            # pair is absent from the second text
            diff += 1
        else:
            diff += abs(otherHits - hits)

    # Large count differences can push the score below zero,
    # which is not a meaningful percentage
    similarity = 100 - int(diff * 100 / len(histogram1))
    return max(similarity, 0)

View File

@ -13,23 +13,50 @@ from utils import acctDir
from utils import removeIdEnding
def updateConversation(baseDir: str, nickname: str, domain: str,
postJsonObject: {}) -> bool:
"""Ads a post to a conversation index in the /conversation subdirectory
def _getConversationFilename(baseDir: str, nickname: str, domain: str,
                             postJsonObject: {}) -> str:
    """Returns the conversation filename

    The filename is within the /conversation subdirectory of the
    directory returned by acctDir, and that subdirectory is created
    if it does not already exist.
    None is returned if the post fails the hasObjectDict check, or if
    its object has no conversation or no id
    """
    if not hasObjectDict(postJsonObject):
        return None
    postObject = postJsonObject['object']
    if not postObject.get('conversation'):
        return None
    if not postObject.get('id'):
        return None
    conversationDir = acctDir(baseDir, nickname, domain) + '/conversation'
    if not os.path.isdir(conversationDir):
        os.mkdir(conversationDir)
    # The conversation is used as a single filename, so any path
    # separators within it are replaced
    conversationId = postObject['conversation'].replace('/', '#')
    return conversationDir + '/' + conversationId
def previousConversationPostId(baseDir: str, nickname: str, domain: str,
                               postJsonObject: {}) -> str:
    """Returns the previous conversation post id

    This is the last line of the conversation index file for the
    given post, with its newline removed. False is returned if there
    is no conversation filename, no index file or the file is empty
    """
    indexFilename = \
        _getConversationFilename(baseDir, nickname, domain, postJsonObject)
    if indexFilename and os.path.isfile(indexFilename):
        with open(indexFilename, 'r') as indexFile:
            entries = indexFile.readlines()
        if entries:
            # the most recently appended entry is at the end of the file
            return entries[-1].replace('\n', '')
    return False
def updateConversation(baseDir: str, nickname: str, domain: str,
postJsonObject: {}) -> bool:
"""Ads a post to a conversation index in the /conversation subdirectory
"""
conversationFilename = \
_getConversationFilename(baseDir, nickname, domain, postJsonObject)
if not conversationFilename:
return False
postId = removeIdEnding(postJsonObject['object']['id'])
conversationFilename = conversationDir + '/' + conversationId
if not os.path.isfile(conversationFilename):
try:
with open(conversationFilename, 'w+') as fp:

View File

@ -78,6 +78,7 @@ from utils import isDM
from utils import isReply
from utils import hasActor
from httpsig import messageContentDigest
from posts import editedPostFilename
from posts import savePostToBox
from posts import isCreateInsideAnnounce
from posts import createDirectMessagePost
@ -2836,8 +2837,22 @@ def _inboxAfterInitial(recentPostsCache: {}, maxRecentPosts: int,
timeDiff + ' mS')
handleName = handle.split('@')[0]
# is this an edit of a previous post?
# in Mastodon "delete and redraft"
# NOTE: this must be done before updateConversation is called
editedFilename = \
editedPostFilename(baseDir, handleName, domain,
postJsonObject, debug, 300)
updateConversation(baseDir, handleName, domain, postJsonObject)
# If this was an edit then delete the previous version of the post
if editedFilename:
deletePost(baseDir, httpPrefix,
nickname, domain, editedFilename,
debug, recentPostsCache)
_inboxUpdateCalendar(baseDir, handle, postJsonObject)
storeHashTags(baseDir, handleName, postJsonObject)

View File

@ -70,6 +70,7 @@ from utils import localActorUrl
from media import attachMedia
from media import replaceYouTube
from media import replaceTwitter
from content import wordsSimilarity
from content import limitRepeatedWords
from content import tagExists
from content import removeLongWords
@ -85,6 +86,7 @@ from linked_data_sig import generateJsonSignature
from petnames import resolvePetnames
from video import convertVideoToNote
from context import getIndividualPostContext
from conversation import previousConversationPostId
def isModerator(baseDir: str, nickname: str) -> bool:
@ -4959,3 +4961,82 @@ def c2sBoxJson(baseDir: str, session,
print('DEBUG: GET c2sBoxJson success')
return boxJson
def secondsBetweenPublished(published1: str, published2: str) -> int:
    """Returns the number of seconds between two published dates

    published1, published2: dates in the format YYYY-MM-DDTHH:MM:SSZ

    Returns the absolute number of seconds between the two dates,
    including any whole days, or -1 if either date can't be parsed.
    Because the difference is absolute, -1 always indicates an error
    """
    dateFormat = '%Y-%m-%dT%H:%M:%SZ'
    try:
        published1Time = datetime.datetime.strptime(published1, dateFormat)
        published2Time = datetime.datetime.strptime(published2, dateFormat)
    except (ValueError, TypeError):
        # ValueError: wrong format, TypeError: not a string
        return -1
    # timedelta.seconds only holds the sub-day remainder, so dates more
    # than a day apart would appear close together. total_seconds
    # covers the whole interval
    return int(abs((published2Time - published1Time).total_seconds()))
def editedPostFilename(baseDir: str, nickname: str, domain: str,
                       postJsonObject: {}, debug: bool,
                       maxTimeDiffSeconds: int) -> str:
    """Returns the filename of the edited post

    The last post in the same conversation is treated as the
    previous version of postJsonObject when all of these hold:
      - both posts have published, id and content fields
      - the ids differ only in their final path segment
      - the published dates are valid and no more than
        maxTimeDiffSeconds apart
      - wordsSimilarity of the two contents is at least 75

    Returns the filename of the previous post, or an empty string
    if postJsonObject does not appear to be an edit
    """
    requiredFields = ('published', 'id', 'content')

    if not hasObjectDict(postJsonObject):
        return ''
    postObject = postJsonObject['object']
    for fieldName in requiredFields:
        if not postObject.get(fieldName):
            return ''

    prevConvPostId = \
        previousConversationPostId(baseDir, nickname, domain,
                                   postJsonObject)
    if not prevConvPostId:
        return ''
    prevConvPostFilename = \
        locatePost(baseDir, nickname, domain, prevConvPostId, False)
    if not prevConvPostFilename:
        return ''
    prevPostJsonObject = loadJson(prevConvPostFilename, 0)
    if not prevPostJsonObject:
        return ''
    if not hasObjectDict(prevPostJsonObject):
        return ''
    prevPostObject = prevPostJsonObject['object']
    for fieldName in requiredFields:
        if not prevPostObject.get(fieldName):
            return ''

    # the same post can't be an edit of itself
    if prevPostObject['id'] == postObject['id']:
        return ''

    id1 = removeIdEnding(prevPostObject['id'])
    if '/' not in id1:
        return ''
    id2 = removeIdEnding(postObject['id'])
    if '/' not in id2:
        return ''
    # Split off only the final path segment. Replacing the ending
    # throughout the id would also remove any earlier occurrences of
    # the same text and give a misleading comparison
    idPrefix1, ending1 = id1.rsplit('/', 1)
    if not ending1:
        return ''
    idPrefix2, ending2 = id2.rsplit('/', 1)
    if not ending2:
        return ''
    if idPrefix1 != idPrefix2:
        return ''

    timeDiffSeconds = \
        secondsBetweenPublished(prevPostObject['published'],
                                postObject['published'])
    # a negative value means that a published date could not be parsed
    if timeDiffSeconds < 0 or timeDiffSeconds > maxTimeDiffSeconds:
        return ''
    if debug:
        print(id2 + ' might be an edit of ' + id1)
    if wordsSimilarity(prevPostObject['content'],
                       postObject['content'], 10) < 75:
        return ''
    print(id2 + ' is an edit of ' + id1)
    return prevConvPostFilename

View File

@ -45,6 +45,7 @@ from posts import noOfFollowersOnDomain
from posts import groupFollowersByDomain
from posts import archivePostsForPerson
from posts import sendPostViaServer
from posts import secondsBetweenPublished
from follow import clearFollows
from follow import clearFollowers
from follow import sendFollowRequestViaServer
@ -119,6 +120,7 @@ from inbox import jsonPostAllowsComments
from inbox import validInbox
from inbox import validInboxFilenames
from categories import guessHashtagCategory
from content import wordsSimilarity
from content import getPriceFromString
from content import limitRepeatedWords
from content import switchWords
@ -5716,6 +5718,33 @@ def _testCanReplyTo(baseDir: str) -> None:
postJsonObject)
def _testSecondsBetweenPublished() -> None:
    """Checks secondsBetweenPublished with a valid and an invalid date
    """
    print('testSecondsBetweenPublished')
    startDate = "2021-10-14T09:39:27Z"

    # two minutes and one second after the start date
    assert secondsBetweenPublished(startDate, "2021-10-14T09:41:28Z") == 121

    # invalid date, having 'N' where the 'T' separator should be
    assert secondsBetweenPublished(startDate, "2021-10-14N09:41:28Z") == -1
def _testWordsSimilarity() -> None:
    """Checks wordsSimilarity with identical and with near-identical text
    """
    print('testWordsSimilarity')
    wordsThreshold = 10

    # identical texts score 100, even when shorter than the threshold
    sameText = "This is the same"
    assert wordsSimilarity(sameText, sameText, wordsThreshold) == 100

    # differs in punctuation, capitalization and one word
    beforeEdit = ("This is our world now... "
                  "the world of the electron and the switch, "
                  "the beauty of the baud")
    afterEdit = ("This is our world now. "
                 "The world of the electron and the webkit, "
                 "the beauty of the baud")
    assert wordsSimilarity(beforeEdit, afterEdit, wordsThreshold) > 70
def runAllTests():
baseDir = os.getcwd()
print('Running tests...')
@ -5723,6 +5752,8 @@ def runAllTests():
_translateOntology(baseDir)
_testGetPriceFromString()
_testFunctions()
_testWordsSimilarity()
_testSecondsBetweenPublished()
_testSignAndVerify()
_testDangerousSVG(baseDir)
_testCanReplyTo(baseDir)