aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: Gertjan van den Burg <gertjanvandenburg@gmail.com> 2021-05-30 15:47:32 +0100
committer: Gertjan van den Burg <gertjanvandenburg@gmail.com> 2021-05-30 15:47:32 +0100
commit: 723df60cc1ad83cf33f77bb79633b55faa92b4f3 (patch)
tree: abed1abf24a782361d52cfe792e158b585bfa14d
parent: Minor fixes (diff)
download: paper2remarkable-723df60cc1ad83cf33f77bb79633b55faa92b4f3.tar.gz
paper2remarkable-723df60cc1ad83cf33f77bb79633b55faa92b4f3.zip
Add IACR provider (partially addresses #104)
-rw-r--r-- paper2remarkable/providers/__init__.py 2
-rw-r--r-- paper2remarkable/providers/_base.py 1
-rw-r--r-- paper2remarkable/providers/iacr.py 111
-rw-r--r-- tests/test_providers.py 22
-rw-r--r-- tests/test_ui.py 11
5 files changed, 147 insertions, 0 deletions
diff --git a/paper2remarkable/providers/__init__.py b/paper2remarkable/providers/__init__.py
index 5130147..fb12a21 100644
--- a/paper2remarkable/providers/__init__.py
+++ b/paper2remarkable/providers/__init__.py
@@ -6,6 +6,7 @@ from .arxiv import Arxiv
from .citeseerx import CiteSeerX
from .cvf import CVF
from .html import HTML
+from .iacr import IACR
from .jmlr import JMLR
from .local import LocalFile
from .nature import Nature
@@ -28,6 +29,7 @@ providers = [
Arxiv,
CiteSeerX,
CVF,
+ IACR,
JMLR,
Nature,
NBER,
diff --git a/paper2remarkable/providers/_base.py b/paper2remarkable/providers/_base.py
index b41390c..9357b91 100644
--- a/paper2remarkable/providers/_base.py
+++ b/paper2remarkable/providers/_base.py
@@ -170,6 +170,7 @@ class Provider(metaclass=abc.ABCMeta):
def uncompress_pdf(self, in_pdf, out_pdf):
""" Uncompress a pdf file """
+ logger.info(f"Uncompressing with {self.pdftool} ...")
if self.pdftool == "pdftk":
status = subprocess.call(
[
diff --git a/paper2remarkable/providers/iacr.py b/paper2remarkable/providers/iacr.py
new file mode 100644
index 0000000..f91d2e5
--- /dev/null
+++ b/paper2remarkable/providers/iacr.py
@@ -0,0 +1,111 @@
+# -*- coding: utf-8 -*-
+
+"""Provider for IACR's eprints
+
+Author: G.J.J. van den Burg
+License: See LICENSE file
+Copyright: 2019, G.J.J. van den Burg
+
+"""
+
+import bs4
+import os
+import re
+import urllib.parse
+
+from ._info import Informer
+from ._base import Provider
+from ..exceptions import URLResolutionError
+from ..log import Logger
+from ..utils import get_page_with_retry
+
+logger = Logger()
+
+
+class IACRInformer(Informer):
+    """Extract title, authors and year from an IACR ePrint abstract page.
+
+    Every getter receives the parsed abstract page (``soup``). When the
+    element it relies on is missing, it logs a warning that points the user
+    to ``--filename`` and returns an empty string.
+    """
+
+    def get_title(self, soup):
+        """Return the part of the page ``<title>`` after the first "-"."""
+        title = soup.find_all("title")
+        if not title:
+            logger.warning(
+                "Couldn't determine title information, maybe provide the desired filename using '--filename'?"
+            )
+            return ""
+        # If the title contains no "-", the whole title text is returned.
+        return title[0].get_text().split("-", maxsplit=1)[-1]
+
+    def get_authors(self, soup):
+        """Return the formatted authors taken from the first ``<i>`` tag."""
+        i = soup.find_all("i")
+        if not i:
+            logger.warning(
+                "Couldn't determine author information, maybe provide the desired filename using '--filename'?"
+            )
+            return ""
+        authors = i[0].get_text()
+        # NOTE(review): as rendered here this replace is a no-op; the first
+        # argument was presumably a double or non-breaking space before the
+        # page whitespace was collapsed -- confirm against the repository.
+        authors = authors.replace(" ", " ")
+        authors = authors.split(" and ")
+        return self._format_authors(authors, sep=" ", idx=-1)
+
+    def get_year(self, soup):
+        """Return the year from the first ``<h2>`` heading, or "" if absent.
+
+        The text after the first ":" in the heading is searched for a
+        ``<year>/<number>`` report identifier and the four-digit year is
+        returned.
+        """
+        h2 = soup.find_all("h2")
+        if not h2:
+            logger.warning(
+                "Couldn't determine year information, maybe provide the desired filename using '--filename'?"
+            )
+            return ""
+        text = h2[0].get_text()
+        report = text.split(":", maxsplit=1)[-1]
+        # Match the identifier instead of indexing the second word, so that
+        # an unexpected heading yields a warning rather than an IndexError.
+        match = re.search(r"(\d{4})/\d+", report)
+        if match is None:
+            logger.warning(
+                "Couldn't determine year information, maybe provide the desired filename using '--filename'?"
+            )
+            return ""
+        return match.group(1)
+
+
+class IACR(Provider):
+    """Provider for papers on the IACR ePrint archive (eprint.iacr.org)."""
+
+    # Raw strings keep "\d" a regex escape rather than an invalid string
+    # escape; the dots in the host name are escaped so they only match ".".
+    re_abs = r"https?://eprint\.iacr\.org/\d{4}/\d+$"
+    re_pdf = r"https?://eprint\.iacr\.org/\d{4}/\d+\.pdf$"
+    re_ps = r"https?://eprint\.iacr\.org/\d{4}/\d+\.ps$"
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.informer = IACRInformer()
+
+    def _get_doc_url(self, abs_url):
+        """Return the document URL advertised on the abstract page.
+
+        Looks for the links following the "Available format" label and
+        prefers the PDF link over the Postscript one. When the label or both
+        links are missing, ``abs_url + ".pdf"`` is returned as a fallback.
+        """
+        fallback = abs_url + ".pdf"
+        page = get_page_with_retry(abs_url)
+        soup = bs4.BeautifulSoup(page, "html.parser")
+
+        bb = soup.find_all("b")
+        b = next((b for b in bb if "Available format" in b.get_text()), None)
+        if b is None:
+            return fallback
+        aa = b.find_next_siblings("a")
+        # Order matters: PDF is tried first, Postscript second.
+        for label in ("PDF", "Postscript (PS)"):
+            a = next((a for a in aa if label in a.get_text()), None)
+            if a is not None:
+                return urllib.parse.urljoin(abs_url, a.get("href"))
+        return fallback
+
+    def get_abs_pdf_urls(self, url):
+        """Return ``(abs_url, pdf_url)`` for an abstract, PDF or PS URL.
+
+        Raises URLResolutionError when ``url`` matches none of the patterns.
+        """
+        if re.match(self.re_abs, url):
+            abs_url = url
+            pdf_url = self._get_doc_url(url)
+        elif re.match(self.re_pdf, url):
+            abs_url = url[: -len(".pdf")]
+            pdf_url = url
+        elif re.match(self.re_ps, url):
+            abs_url = url[: -len(".ps")]
+            pdf_url = url
+        else:
+            raise URLResolutionError("IACR", url)
+        return abs_url, pdf_url
+
+    def retrieve_pdf(self, pdf_url, filename):
+        """Download to a temporary name, then rewrite it into ``filename``."""
+        # Bit hacky, can consider adding first-class PS support
+        tmpfilename = os.path.splitext(filename)[0] + "-tmp.pdf"
+        super().retrieve_pdf(pdf_url, tmpfilename)
+        self.rewrite_pdf(tmpfilename, out_pdf=filename)
+
+    def validate(src):
+        """Return a truthy match if ``src`` is an IACR abstract/PDF/PS URL."""
+        # re_ps is included so the ".ps" branch of get_abs_pdf_urls is
+        # reachable for direct Postscript links.
+        return (
+            re.match(IACR.re_abs, src)
+            or re.match(IACR.re_pdf, src)
+            or re.match(IACR.re_ps, src)
+        )
diff --git a/tests/test_providers.py b/tests/test_providers.py
index 1c1e1e6..2e112e0 100644
--- a/tests/test_providers.py
+++ b/tests/test_providers.py
@@ -22,6 +22,7 @@ from paper2remarkable.providers import (
CVF,
CiteSeerX,
HTML,
+ IACR,
JMLR,
LocalFile,
Nature,
@@ -478,6 +479,27 @@ class TestProviders(unittest.TestCase):
with pdf.open_outline() as outline:
assert len(outline.root) > 0
+    def test_iacr_1(self):
+        """An https abstract URL produces the expected output filename."""
+        expected = "Xu_et_al_-_ROSE_Robust_Searchable_Encryption_With_Forward_and_Backward_Security_and_Practical_Performance_2021.pdf"
+        provider = IACR(upload=False, verbose=VERBOSE)
+        result = provider.run("https://eprint.iacr.org/2021/489")
+        self.assertEqual(expected, os.path.basename(result))
+
+    def test_iacr_2(self):
+        """A direct .pdf URL produces the expected output filename."""
+        expected = "Cochran_-_Notes_on_the_Wang_Et_Al._2_63_SHA-1_Differential_Path_2007.pdf"
+        provider = IACR(upload=False, verbose=VERBOSE)
+        result = provider.run("https://eprint.iacr.org/2007/474.pdf")
+        self.assertEqual(expected, os.path.basename(result))
+
+    def test_iacr_3(self):
+        """A plain-http abstract URL produces the expected output filename."""
+        expected = "Naor_Wool_-_Access_Control_and_Signatures_via_Quorum_Secret_Sharing_1996.pdf"
+        provider = IACR(upload=False, verbose=VERBOSE)
+        result = provider.run("http://eprint.iacr.org/1996/008")
+        self.assertEqual(expected, os.path.basename(result))
+
if __name__ == "__main__":
unittest.main()
diff --git a/tests/test_ui.py b/tests/test_ui.py
index 86a3c8e..6dfdaa6 100644
--- a/tests/test_ui.py
+++ b/tests/test_ui.py
@@ -23,6 +23,7 @@ from paper2remarkable.providers import (
CiteSeerX,
CVF,
HTML,
+ IACR,
JMLR,
LocalFile,
Nature,
@@ -114,6 +115,16 @@ class TestUI(unittest.TestCase):
"https://publications.aston.ac.uk/id/eprint/38334/1/5th_Artificial_Neural_Networks.pdf",
),
(
+ IACR,
+ "https://eprint.iacr.org/1999/011",
+ "https://eprint.iacr.org/1999/011",
+ ),
+ (
+ IACR,
+ "https://eprint.iacr.org/2021/685.pdf",
+ "https://eprint.iacr.org/2021/685.pdf",
+ ),
+ (
JMLR,
"https://www.jmlr.org/papers/volume17/14-526/14-526.pdf",
"https://www.jmlr.org/papers/volume17/14-526/14-526.pdf",