Support arXiv scientific publication references

main
Bob Mottram 2023-01-04 11:53:15 +00:00
parent 8716759208
commit 9a7b95eca4
2 changed files with 42 additions and 2 deletions

View File

@ -604,7 +604,10 @@ def add_web_links(content: str) -> str:
# if there are no prefixes then just keep the content we have
if not prefix_found:
return content
if 'arXiv:' in content or 'arx:' in content or 'arxiv:' in content:
prefix_found = True
else:
return content
content = content.replace('\r', '')
words = content.replace('\n', ' --linebreak-- ').split(' ')
@ -612,6 +615,31 @@ def add_web_links(content: str) -> str:
for wrd in words:
if ':' not in wrd:
continue
# handle arxiv scientific references
if wrd.startswith('arXiv:') or \
wrd.startswith('arx:') or \
wrd.startswith('arxiv:'):
arxiv_ref_str = wrd.split(':', 1)[1].lower()
if '.' in arxiv_ref_str:
arxiv_ref = arxiv_ref_str.split('.')
elif ':' in arxiv_ref_str:
arxiv_ref = arxiv_ref_str.split(':')
else:
continue
if len(arxiv_ref) == 2:
arxiv_day = arxiv_ref[1]
if 'v' in arxiv_day:
arxiv_day = arxiv_day.split('v')[0]
if arxiv_ref[0].isdigit() and arxiv_day.isdigit():
ref_str = arxiv_ref[0] + '.' + arxiv_ref[1]
markup = '<a href="https://arxiv.org/abs/' + \
ref_str + '" tabindex="10" ' + \
'rel="nofollow noopener noreferrer" ' + \
'target="_blank">' + \
'<span class="ellipsis">arXiv:' + ref_str + \
'</span></a>'
replace_dict[wrd] = markup
continue
# does the word begin with a prefix?
prefix_found = False
for prefix in prefixes:

View File

@ -3617,7 +3617,19 @@ def _test_web_links():
'<a href="https://somesite.net" tabindex="10" ' + \
'rel="nofollow noopener noreferrer"' + \
' target="_blank"><span class="invisible">https://' + \
'</span><span class="ellipsis">somesite.net</span></a'
'</span><span class="ellipsis">somesite.net</span></a>'
if expected_text not in linked_text:
print(expected_text + '\n')
print(linked_text)
assert expected_text in linked_text
example_text = \
'This post has an arxiv link arXiv:2212.14672 some other text'
linked_text = add_web_links(example_text)
expected_text = \
'<a href="https://arxiv.org/abs/2212.14672" tabindex="10" ' + \
'rel="nofollow noopener noreferrer"' + \
' target="_blank"><span class="ellipsis">arXiv:2212.14672</span></a>'
if expected_text not in linked_text:
print(expected_text + '\n')
print(linked_text)