Calculate word frequencies for account info

merge-requests/30/head
Bob Mottram 2021-01-11 13:14:22 +00:00
parent 914160ef05
commit 345145927a
4 changed files with 53 additions and 14 deletions

View File

@ -564,12 +564,14 @@ if args.postDomains:
args.port = 80 args.port = 80
elif args.gnunet: elif args.gnunet:
proxyType = 'gnunet' proxyType = 'gnunet'
wordFrequency = {}
domainList = [] domainList = []
domainList = getPublicPostDomains(None, domainList = getPublicPostDomains(None,
baseDir, nickname, domain, baseDir, nickname, domain,
proxyType, args.port, proxyType, args.port,
httpPrefix, debug, httpPrefix, debug,
__version__, domainList) __version__,
wordFrequency, domainList)
for postDomain in domainList: for postDomain in domainList:
print(postDomain) print(postDomain)
sys.exit() sys.exit()
@ -602,12 +604,14 @@ if args.postDomainsBlocked:
args.port = 80 args.port = 80
elif args.gnunet: elif args.gnunet:
proxyType = 'gnunet' proxyType = 'gnunet'
wordFrequency = {}
domainList = [] domainList = []
domainList = getPublicPostDomainsBlocked(None, domainList = getPublicPostDomainsBlocked(None,
baseDir, nickname, domain, baseDir, nickname, domain,
proxyType, args.port, proxyType, args.port,
httpPrefix, debug, httpPrefix, debug,
__version__, domainList) __version__,
wordFrequency, domainList)
for postDomain in domainList: for postDomain in domainList:
print(postDomain) print(postDomain)
sys.exit() sys.exit()

View File

@ -469,6 +469,27 @@ def _getPosts(session, outboxUrl: str, maxPosts: int,
return personPosts return personPosts
def _updateWordFrequency(content: str, wordFrequency: {}) -> None:
    """Tallies how often each word occurs within the given content.

    The content is passed through removeHtml, full stops and semicolons
    are replaced by spaces, and the result is split on single spaces.
    Counts are accumulated in the supplied wordFrequency dictionary,
    which is modified in place; nothing is returned.

    Words shorter than three characters are ignored. Three character
    words are only counted when upper() leaves them unchanged.
    """
    text = removeHtml(content).replace('.', ' ').replace(';', ' ')
    for token in text.split(' '):
        size = len(token)
        # too short to be counted (this also drops the empty strings
        # produced by consecutive spaces)
        if size < 3:
            continue
        # three character words must be unchanged by upper()
        if size == 3 and token.upper() != token:
            continue
        # a missing or falsy existing count restarts at one
        wordFrequency[token] = (wordFrequency.get(token) or 0) + 1
def getPostDomains(session, outboxUrl: str, maxPosts: int, def getPostDomains(session, outboxUrl: str, maxPosts: int,
maxMentions: int, maxMentions: int,
maxEmoji: int, maxAttachments: int, maxEmoji: int, maxAttachments: int,
@ -476,7 +497,9 @@ def getPostDomains(session, outboxUrl: str, maxPosts: int,
personCache: {}, personCache: {},
debug: bool, debug: bool,
projectVersion: str, httpPrefix: str, projectVersion: str, httpPrefix: str,
domain: str, domainList=[]) -> []: domain: str,
wordFrequency: {},
domainList=[]) -> []:
"""Returns a list of domains referenced within public posts """Returns a list of domains referenced within public posts
""" """
if not outboxUrl: if not outboxUrl:
@ -503,6 +526,9 @@ def getPostDomains(session, outboxUrl: str, maxPosts: int,
continue continue
if not isinstance(item['object'], dict): if not isinstance(item['object'], dict):
continue continue
if item['object'].get('content'):
_updateWordFrequency(item['object']['content'],
wordFrequency)
if item['object'].get('inReplyTo'): if item['object'].get('inReplyTo'):
if isinstance(item['object']['inReplyTo'], str): if isinstance(item['object']['inReplyTo'], str):
postDomain, postPort = \ postDomain, postPort = \
@ -3334,7 +3360,7 @@ def getPublicPostsOfPerson(baseDir: str, nickname: str, domain: str,
def getPublicPostDomains(session, baseDir: str, nickname: str, domain: str, def getPublicPostDomains(session, baseDir: str, nickname: str, domain: str,
proxyType: str, port: int, httpPrefix: str, proxyType: str, port: int, httpPrefix: str,
debug: bool, projectVersion: str, debug: bool, projectVersion: str,
domainList=[]) -> []: wordFrequency: {}, domainList=[]) -> []:
""" Returns a list of domains referenced within public posts """ Returns a list of domains referenced within public posts
""" """
if not session: if not session:
@ -3371,7 +3397,8 @@ def getPublicPostDomains(session, baseDir: str, nickname: str, domain: str,
getPostDomains(session, personUrl, 64, maxMentions, maxEmoji, getPostDomains(session, personUrl, 64, maxMentions, maxEmoji,
maxAttachments, federationList, maxAttachments, federationList,
personCache, debug, personCache, debug,
projectVersion, httpPrefix, domain, domainList) projectVersion, httpPrefix, domain,
wordFrequency, domainList)
postDomains.sort() postDomains.sort()
return postDomains return postDomains
@ -3412,7 +3439,8 @@ def downloadFollowCollection(followType: str,
def getPublicPostInfo(session, baseDir: str, nickname: str, domain: str, def getPublicPostInfo(session, baseDir: str, nickname: str, domain: str,
proxyType: str, port: int, httpPrefix: str, proxyType: str, port: int, httpPrefix: str,
debug: bool, projectVersion: str) -> []: debug: bool, projectVersion: str,
wordFrequency: {}) -> []:
""" Returns a dict of domains referenced within public posts """ Returns a dict of domains referenced within public posts
""" """
if not session: if not session:
@ -3450,7 +3478,8 @@ def getPublicPostInfo(session, baseDir: str, nickname: str, domain: str,
getPostDomains(session, personUrl, maxPosts, maxMentions, maxEmoji, getPostDomains(session, personUrl, maxPosts, maxMentions, maxEmoji,
maxAttachments, federationList, maxAttachments, federationList,
personCache, debug, personCache, debug,
projectVersion, httpPrefix, domain, []) projectVersion, httpPrefix, domain,
wordFrequency, [])
postDomains.sort() postDomains.sort()
domainsInfo = {} domainsInfo = {}
for d in postDomains: for d in postDomains:
@ -3476,7 +3505,7 @@ def getPublicPostDomainsBlocked(session, baseDir: str,
nickname: str, domain: str, nickname: str, domain: str,
proxyType: str, port: int, httpPrefix: str, proxyType: str, port: int, httpPrefix: str,
debug: bool, projectVersion: str, debug: bool, projectVersion: str,
domainList=[]) -> []: wordFrequency: {}, domainList=[]) -> []:
""" Returns a list of domains referenced within public posts which """ Returns a list of domains referenced within public posts which
are globally blocked on this instance are globally blocked on this instance
""" """
@ -3484,7 +3513,7 @@ def getPublicPostDomainsBlocked(session, baseDir: str,
getPublicPostDomains(session, baseDir, nickname, domain, getPublicPostDomains(session, baseDir, nickname, domain,
proxyType, port, httpPrefix, proxyType, port, httpPrefix,
debug, projectVersion, debug, projectVersion,
domainList) wordFrequency, domainList)
if not postDomains: if not postDomains:
return [] return []
@ -3532,9 +3561,10 @@ def checkDomains(session, baseDir: str,
nickname: str, domain: str, nickname: str, domain: str,
proxyType: str, port: int, httpPrefix: str, proxyType: str, port: int, httpPrefix: str,
debug: bool, projectVersion: str, debug: bool, projectVersion: str,
maxBlockedDomains: int, singleCheck: bool): maxBlockedDomains: int, singleCheck: bool) -> None:
"""Checks follower accounts for references to globally blocked domains """Checks follower accounts for references to globally blocked domains
""" """
wordFrequency = {}
nonMutuals = _getNonMutualsOfPerson(baseDir, nickname, domain) nonMutuals = _getNonMutualsOfPerson(baseDir, nickname, domain)
if not nonMutuals: if not nonMutuals:
print('No non-mutual followers were found') print('No non-mutual followers were found')
@ -3558,7 +3588,8 @@ def checkDomains(session, baseDir: str,
nonMutualNickname, nonMutualNickname,
nonMutualDomain, nonMutualDomain,
proxyType, port, httpPrefix, proxyType, port, httpPrefix,
debug, projectVersion, []) debug, projectVersion,
wordFrequency, [])
if blockedDomains: if blockedDomains:
if len(blockedDomains) > maxBlockedDomains: if len(blockedDomains) > maxBlockedDomains:
followerWarningStr += handle + '\n' followerWarningStr += handle + '\n'
@ -3577,7 +3608,8 @@ def checkDomains(session, baseDir: str,
nonMutualNickname, nonMutualNickname,
nonMutualDomain, nonMutualDomain,
proxyType, port, httpPrefix, proxyType, port, httpPrefix,
debug, projectVersion, []) debug, projectVersion,
wordFrequency, [])
if blockedDomains: if blockedDomains:
print(handle) print(handle)
for d in blockedDomains: for d in blockedDomains:

View File

@ -67,11 +67,13 @@ def instancesGraph(baseDir: str, handles: str,
projectVersion, httpPrefix, projectVersion, httpPrefix,
nickname, domain, 'outbox', nickname, domain, 'outbox',
27261) 27261)
wordFrequency = {}
postDomains = \ postDomains = \
getPostDomains(session, personUrl, 64, maxMentions, maxEmoji, getPostDomains(session, personUrl, 64, maxMentions, maxEmoji,
maxAttachments, federationList, maxAttachments, federationList,
personCache, debug, personCache, debug,
projectVersion, httpPrefix, domain, []) projectVersion, httpPrefix, domain,
wordFrequency, [])
postDomains.sort() postDomains.sort()
for fedDomain in postDomains: for fedDomain in postDomains:
dotLineStr = ' "' + domain + '" -> "' + fedDomain + '";\n' dotLineStr = ' "' + domain + '" -> "' + fedDomain + '";\n'

View File

@ -97,11 +97,12 @@ def htmlAccountInfo(cssCache: {}, translate: {},
session = createSession(proxyType) session = createSession(proxyType)
wordFrequency = {}
domainDict = getPublicPostInfo(session, domainDict = getPublicPostInfo(session,
baseDir, searchNickname, searchDomain, baseDir, searchNickname, searchDomain,
proxyType, searchPort, proxyType, searchPort,
httpPrefix, debug, httpPrefix, debug,
__version__) __version__, wordFrequency)
# get a list of any blocked followers # get a list of any blocked followers
followersList = \ followersList = \