diff options
| author | Gertjan van den Burg <gertjanvandenburg@gmail.com> | 2020-01-28 13:07:27 +0000 |
|---|---|---|
| committer | Gertjan van den Burg <gertjanvandenburg@gmail.com> | 2020-01-28 13:07:27 +0000 |
| commit | 2405077323435a59ce32c8c4d994f67ae505ec7c (patch) | |
| tree | 765b0e486c336b672f0f93aa248c133eb9e75a04 | |
| parent | Remove occasional redundant logging message (diff) | |
| download | paper2remarkable-2405077323435a59ce32c8c4d994f67ae505ec7c.tar.gz paper2remarkable-2405077323435a59ce32c8c4d994f67ae505ec7c.zip | |
Add support for older arXiv papers
| -rw-r--r-- | paper2remarkable/providers/arxiv.py | 18 | ||||
| -rw-r--r-- | tests/test_providers.py | 16 |
2 files changed, 29 insertions, 5 deletions
diff --git a/paper2remarkable/providers/arxiv.py b/paper2remarkable/providers/arxiv.py index 282eb09..913e015 100644 --- a/paper2remarkable/providers/arxiv.py +++ b/paper2remarkable/providers/arxiv.py @@ -33,8 +33,11 @@ class ArxivInformer(Informer): class Arxiv(Provider): - re_abs = "https?://arxiv.org/abs/\d{4}\.\d{4,5}(v\d+)?" - re_pdf = "https?://arxiv.org/pdf/\d{4}\.\d{4,5}(v\d+)?\.pdf" + re_abs_1 = "https?://arxiv.org/abs/\d{4}\.\d{4,5}(v\d+)?" + re_pdf_1 = "https?://arxiv.org/pdf/\d{4}\.\d{4,5}(v\d+)?\.pdf" + + re_abs_2 = "https?://arxiv.org/abs/\w+/\d{7}(v\d+)?" + re_pdf_2 = "https?://arxiv.org/pdf/\w+/\d{7}(v\d+)?.pdf" def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -45,10 +48,10 @@ class Arxiv(Provider): def get_abs_pdf_urls(self, url): """Get the pdf and abs url from any given arXiv url """ - if re.match(self.re_abs, url): + if re.match(self.re_abs_1, url) or re.match(self.re_abs_2, url): abs_url = url pdf_url = url.replace("abs", "pdf") + ".pdf" - elif re.match(self.re_pdf, url): + elif re.match(self.re_pdf_1, url) or re.match(self.re_pdf_2, url): abs_url = url[:-4].replace("pdf", "abs") pdf_url = url else: @@ -57,7 +60,12 @@ class Arxiv(Provider): def validate(src): """Check if the url is to an arXiv page. 
""" - return re.match(Arxiv.re_abs, src) or re.match(Arxiv.re_pdf, src) + return ( + re.match(Arxiv.re_abs_1, src) + or re.match(Arxiv.re_pdf_1, src) + or re.match(Arxiv.re_abs_2, src) + or re.match(Arxiv.re_pdf_2, src) + ) def dearxiv(self, input_file): """Remove the arXiv timestamp from a pdf""" diff --git a/tests/test_providers.py b/tests/test_providers.py index 9d1882d..e256eec 100644 --- a/tests/test_providers.py +++ b/tests/test_providers.py @@ -79,6 +79,22 @@ class TestProviders(unittest.TestCase): filename = prov.run(url) self.assertEqual(exp_filename, os.path.basename(filename)) + def test_arxiv_3(self): + prov = Arxiv(upload=False, verbose=VERBOSE) + url = "https://arxiv.org/abs/math/0309285" + exp_filename = "Jackson_et_al_-_An_Algorithm_for_Optimal_Partitioning_of_Data_on_an_Interval_2003.pdf" + filename = prov.run(url) + self.assertEqual(exp_filename, os.path.basename(filename)) + + def test_arxiv_4(self): + prov = Arxiv(upload=False, verbose=VERBOSE) + url = "https://arxiv.org/pdf/physics/0605197v1.pdf" + exp_filename = ( + "Knuth_-_Optimal_Data-Based_Binning_for_Histograms_2006.pdf" + ) + filename = prov.run(url) + self.assertEqual(exp_filename, os.path.basename(filename)) + def test_pmc(self): prov = PubMed(upload=False, verbose=VERBOSE) url = "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3474301/" |
