diff --git a/content.py b/content.py
index 042827fb1..c80dd4229 100644
--- a/content.py
+++ b/content.py
@@ -585,6 +585,73 @@ def _shorten_linked_urls(content: str) -> str:
return content
+def _contains_doi_reference(wrd: str, replace_dict: {}) -> bool:
+ """Handle DOI scientific references
+ """
+ if not wrd.startswith('doi:') and \
+ not wrd.startswith('DOI:'):
+ return False
+
+ doi_ref_str = wrd.split(':', 1)[1]
+ doi_site = 'https://sci-hub.ru'
+ markup = '' + \
+ 'doi:' + doi_ref_str + \
+ ''
+ replace_dict[wrd] = markup
+ return True
+
+
+def _contains_arxiv_reference(wrd: str, replace_dict: {}) -> bool:
+ """Handle arxiv scientific references
+ """
+ if not wrd.startswith('arXiv:') and \
+ not wrd.startswith('arx:') and \
+ not wrd.startswith('arxiv:'):
+ return False
+
+ arxiv_ref_str = wrd.split(':', 1)[1].lower()
+ if '.' in arxiv_ref_str:
+ arxiv_ref = arxiv_ref_str.split('.')
+ elif ':' in arxiv_ref_str:
+ arxiv_ref = arxiv_ref_str.split(':')
+ else:
+ return False
+ if len(arxiv_ref) != 2:
+ return False
+ if not arxiv_ref[0].isdigit():
+ return False
+ arxiv_day = arxiv_ref[1]
+ if 'v' in arxiv_day:
+ arxiv_day = arxiv_day.split('v')[0]
+ if not arxiv_day.isdigit():
+ return False
+ ref_str = arxiv_ref[0] + '.' + arxiv_ref[1]
+ markup = '' + \
+ 'arXiv:' + ref_str + \
+ ''
+ replace_dict[wrd] = markup
+ return True
+
+
+def _contains_academic_references(content: str) -> bool:
+ """Does the given content contain academic references
+ """
+ prefixes = (
+ 'arXiv:', 'arx:', 'arxiv:',
+ 'doi:', 'DOI:'
+ )
+ for reference in prefixes:
+ if reference in content:
+ return True
+ return False
+
+
def add_web_links(content: str) -> str:
"""Adds markup for web links
"""
@@ -604,7 +671,7 @@ def add_web_links(content: str) -> str:
# if there are no prefixes then just keep the content we have
if not prefix_found:
- if 'arXiv:' in content or 'arx:' in content or 'arxiv:' in content:
+ if _contains_academic_references(content):
prefix_found = True
else:
return content
@@ -615,30 +682,9 @@ def add_web_links(content: str) -> str:
for wrd in words:
if ':' not in wrd:
continue
- # handle arxiv scientific references
- if wrd.startswith('arXiv:') or \
- wrd.startswith('arx:') or \
- wrd.startswith('arxiv:'):
- arxiv_ref_str = wrd.split(':', 1)[1].lower()
- if '.' in arxiv_ref_str:
- arxiv_ref = arxiv_ref_str.split('.')
- elif ':' in arxiv_ref_str:
- arxiv_ref = arxiv_ref_str.split(':')
- else:
- continue
- if len(arxiv_ref) == 2:
- arxiv_day = arxiv_ref[1]
- if 'v' in arxiv_day:
- arxiv_day = arxiv_day.split('v')[0]
- if arxiv_ref[0].isdigit() and arxiv_day.isdigit():
- ref_str = arxiv_ref[0] + '.' + arxiv_ref[1]
- markup = '' + \
- 'arXiv:' + ref_str + \
- ''
- replace_dict[wrd] = markup
+ if _contains_arxiv_reference(wrd, replace_dict):
+ continue
+ if _contains_doi_reference(wrd, replace_dict):
continue
# does the word begin with a prefix?
prefix_found = False
diff --git a/tests.py b/tests.py
index 2bacd9d6b..c99597fee 100644
--- a/tests.py
+++ b/tests.py
@@ -3623,6 +3623,11 @@ def _test_web_links():
print(linked_text)
assert expected_text in linked_text
+ # NOTE: it is difficult to find academic studies of the fediverse which
+ # do not in some way violate consent or embody an arrogant status
+ # quo attitude. Did all those scraped accounts agree to be part of
+ # an academic study? Did they even consider consent as an issue?
+ # It seems doubtful. We are just like algae under a microscope to them.
example_text = \
'This post has an arxiv link arXiv:2203.15752 some other text'
linked_text = add_web_links(example_text)
@@ -3635,6 +3640,20 @@ def _test_web_links():
print(linked_text)
assert expected_text in linked_text
+ example_text = \
+ 'This post has an doi link ' + \
+ 'doi:10.1109/INFCOMW.2019.8845221 some other text'
+ linked_text = add_web_links(example_text)
+ expected_text = \
+ '' + \
+ 'doi:10.1109/INFCOMW.2019.8845221'
+ if expected_text not in linked_text:
+ print(expected_text + '\n')
+ print(linked_text)
+ assert expected_text in linked_text
+
example_text = \
'This post has a very long web link\n\nhttp://' + \
'cbwebewuvfuftdiudbqd33dddbbyuef23fyug3bfhcyu2fct2' + \
diff --git a/webapp_podcast.py b/webapp_podcast.py
index 4388b4d83..e54739dbc 100644
--- a/webapp_podcast.py
+++ b/webapp_podcast.py
@@ -299,7 +299,7 @@ def html_podcast_episode(translate: {},
text_mode_banner: str, access_keys: {},
session, session_onion, session_i2p,
http_prefix: str, debug: bool) -> str:
- """Returns html for a podcast episode, giebn an item from the newswire
+ """Returns html for a podcast episode, given an item from the newswire
"""
css_filename = base_dir + '/epicyon-podcast.css'
if os.path.isfile(base_dir + '/podcast.css'):