From 9a7b95eca4208bd53281cdff08dcda827c9c7594 Mon Sep 17 00:00:00 2001 From: Bob Mottram Date: Wed, 4 Jan 2023 11:53:15 +0000 Subject: [PATCH] Support arxiv scientific publication references --- content.py | 30 +++++++++++++++++++++++++++++- tests.py | 14 +++++++++++++- 2 files changed, 42 insertions(+), 2 deletions(-) diff --git a/content.py b/content.py index a1320d4b7..042827fb1 100644 --- a/content.py +++ b/content.py @@ -604,7 +604,10 @@ def add_web_links(content: str) -> str: # if there are no prefixes then just keep the content we have if not prefix_found: - return content + if 'arXiv:' in content or 'arx:' in content or 'arxiv:' in content: + prefix_found = True + else: + return content content = content.replace('\r', '') words = content.replace('\n', ' --linebreak-- ').split(' ') @@ -612,6 +615,31 @@ def add_web_links(content: str) -> str: for wrd in words: if ':' not in wrd: continue + # handle arxiv scientific references + if wrd.startswith('arXiv:') or \ + wrd.startswith('arx:') or \ + wrd.startswith('arxiv:'): + arxiv_ref_str = wrd.split(':', 1)[1].lower() + if '.' in arxiv_ref_str: + arxiv_ref = arxiv_ref_str.split('.') + elif ':' in arxiv_ref_str: + arxiv_ref = arxiv_ref_str.split(':') + else: + continue + if len(arxiv_ref) == 2: + arxiv_day = arxiv_ref[1] + if 'v' in arxiv_day: + arxiv_day = arxiv_day.split('v')[0] + if arxiv_ref[0].isdigit() and arxiv_day.isdigit(): + ref_str = arxiv_ref[0] + '.' + arxiv_ref[1] + markup = '' + \ + 'arXiv:' + ref_str + \ + '' + replace_dict[wrd] = markup + continue # does the word begin with a prefix? prefix_found = False for prefix in prefixes: diff --git a/tests.py b/tests.py index da05e6d37..a15bf87f2 100644 --- a/tests.py +++ b/tests.py @@ -3617,7 +3617,19 @@ def _test_web_links(): 'somesite.netsomesite.net' + if expected_text not in linked_text: + print(expected_text + '\n') + print(linked_text) + assert expected_text in linked_text + + example_text = \ + 'This post has an arxiv link arXiv:2212.14672 some other text' + linked_text = add_web_links(example_text) + expected_text = \ + 'arXiv:2212.14672' if expected_text not in linked_text: print(expected_text + '\n') print(linked_text)