aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorGertjan van den Burg <gertjanvandenburg@gmail.com>2019-10-24 22:57:44 +0100
committerGertjan van den Burg <gertjanvandenburg@gmail.com>2019-10-24 22:57:44 +0100
commit058589548a6b91350e240468f5ddaa47e7a10abf (patch)
tree95a61a78125cda30d0c1088b4f437a51da10caa2
parentRename unit test file (diff)
downloadpaper2remarkable-058589548a6b91350e240468f5ddaa47e7a10abf.tar.gz
paper2remarkable-058589548a6b91350e240468f5ddaa47e7a10abf.zip
Move paper info functionality to Informer class
-rw-r--r--paper2remarkable/__init__.py3
-rw-r--r--paper2remarkable/providers/__init__.py4
-rw-r--r--paper2remarkable/providers/_base.py100
-rw-r--r--paper2remarkable/providers/_info.py103
-rw-r--r--paper2remarkable/providers/acm.py41
-rw-r--r--paper2remarkable/providers/arxiv.py9
-rw-r--r--paper2remarkable/providers/local.py26
-rw-r--r--paper2remarkable/providers/openreview.py15
-rw-r--r--paper2remarkable/providers/pdf_url.py27
-rw-r--r--paper2remarkable/providers/pubmed.py30
-rw-r--r--paper2remarkable/providers/springer.py15
-rw-r--r--paper2remarkable/utils.py6
12 files changed, 221 insertions, 158 deletions
diff --git a/paper2remarkable/__init__.py b/paper2remarkable/__init__.py
index e69de29..71c1105 100644
--- a/paper2remarkable/__init__.py
+++ b/paper2remarkable/__init__.py
@@ -0,0 +1,3 @@
+# -*- coding: utf-8 -*-
+
+GITHUB_URL = "https://github.com/GjjvdBurg/arxiv2remarkable"
diff --git a/paper2remarkable/providers/__init__.py b/paper2remarkable/providers/__init__.py
index 361c11e..f6f93f9 100644
--- a/paper2remarkable/providers/__init__.py
+++ b/paper2remarkable/providers/__init__.py
@@ -1,11 +1,11 @@
# -*- coding: utf-8 -*-
from .arxiv import Arxiv
-from .pubmed import Pubmed
+from .pubmed import PubMed
from .acm import ACM
from .openreview import OpenReview
from .springer import Springer
from .local import LocalFile
from .pdf_url import PdfUrl
-providers = [Arxiv, Pubmed, ACM, OpenReview, Springer, LocalFile, PdfUrl]
+providers = [Arxiv, PubMed, ACM, OpenReview, Springer, LocalFile, PdfUrl]
diff --git a/paper2remarkable/providers/_base.py b/paper2remarkable/providers/_base.py
index 8b454b0..ca6ab70 100644
--- a/paper2remarkable/providers/_base.py
+++ b/paper2remarkable/providers/_base.py
@@ -9,31 +9,19 @@ Copyright: 2019, G.J.J. van den Burg
"""
import abc
-import bs4
import logging
import os
import shutil
import tempfile
-import titlecase
-import unidecode
+from ._info import Informer
from ..pdf_ops import crop_pdf, center_pdf, blank_pdf, shrink_pdf
-from ..utils import (
- check_file_is_pdf,
- clean_string,
- download_url,
- get_page_with_retry,
- upload_to_remarkable,
-)
+from ..utils import assert_file_is_pdf, download_url, upload_to_remarkable
class Provider(metaclass=abc.ABCMeta):
""" ABC for providers of pdf sources """
- meta_author_key = "citation_author"
- meta_title_key = "citation_title"
- meta_date_key = "citation_date"
-
def __init__(
self,
verbose=False,
@@ -54,11 +42,14 @@ class Provider(metaclass=abc.ABCMeta):
self.pdfcrop_path = pdfcrop_path
self.pdftk_path = pdftk_path
self.gs_path = gs_path
+ self.informer = Informer()
- if not self.verbose:
+ # disable logging if requested
+ logging.basicConfig(level=logging.INFO)
+ if not verbose:
logging.disable()
- # Define the operations to run on the pdf. Providers can add others
+ # Define the operations to run on the pdf. Providers can add others.
self.operations = [("crop", self.crop_pdf)]
if center:
self.operations.append(("center", self.center_pdf))
@@ -84,87 +75,24 @@ class Provider(metaclass=abc.ABCMeta):
def shrink_pdf(self, filepath):
return shrink_pdf(filepath, gs_path=self.gs_path)
- def retrieve_pdf(self, src, filename):
+ def retrieve_pdf(self, pdf_url, filename):
""" Download pdf from src and save to filename """
- _, pdf_url = self.get_abs_pdf_urls(src)
+        # This must exist so that the LocalFile provider can override it
download_url(pdf_url, filename)
- def _format_authors(self, soup_authors, sep=",", idx=0, op=None):
- op = (lambda x: x) if op is None else op
- # format the author list retrieved by bs4
- return [x.strip().split(sep)[idx].strip() for x in op(soup_authors)]
-
- def get_authors(self, soup):
- authors = [
- x["content"]
- for x in soup.find_all("meta", {"name": self.meta_author_key})
- ]
- return self._format_authors(authors)
-
- def get_title(self, soup):
- target = soup.find_all("meta", {"name": self.meta_title_key})
- return target[0]["content"]
-
- def _format_date(self, soup_date):
- return soup_date
-
- def get_date(self, soup):
- date = soup.find_all("meta", {"name": self.meta_date_key})[0][
- "content"
- ]
- return self._format_date(date)
-
- def get_paper_info(
- self,
- src,
- author_key="citation_author",
- title_key="citation_title",
- date_key="citation_date",
- ):
- """ Retrieve the title/author (surnames)/year information """
- abs_url, _ = self.get_abs_pdf_urls(src)
- logging.info("Getting paper info")
- page = get_page_with_retry(abs_url)
- soup = bs4.BeautifulSoup(page, "html.parser")
- authors = self.get_authors(soup)
- title = self.get_title(soup)
- date = self.get_date(soup)
- return dict(title=title, date=date, authors=authors)
-
- def create_filename(self, info):
- """ Generate filename using the info dict or filename if provided """
- # we assume that the list of authors is surname only.
- logging.info("Generating output filename")
-
- if len(info["authors"]) > 3:
- author_part = info["authors"][0] + "_et_al"
- else:
- author_part = "_".join(info["authors"])
- author_part = clean_string(author_part)
-
- title_part = clean_string(info["title"])
- title_part = titlecase.titlecase(title_part).replace(" ", "_")
-
- year_part = info["date"].split("/")[0]
-
- name = author_part + "_-_" + title_part + "_" + year_part + ".pdf"
- name = unidecode.unidecode(name)
- logging.info("Created filename: %s" % name)
- return name
-
def run(self, src, filename=None):
- info = self.get_paper_info(src)
- clean_filename = filename or self.create_filename(info)
+ abs_url, pdf_url = self.get_abs_pdf_urls(src)
+ clean_filename = filename or self.informer.get_filename(abs_url)
tmp_filename = "paper.pdf"
self.initial_dir = os.getcwd()
with tempfile.TemporaryDirectory(prefix="a2r_") as working_dir:
os.chdir(working_dir)
- self.retrieve_pdf(src, tmp_filename)
- check_file_is_pdf(tmp_filename)
+ self.retrieve_pdf(pdf_url, tmp_filename)
+ assert_file_is_pdf(tmp_filename)
intermediate_fname = tmp_filename
- for op in self.operations:
+ for opname, op in self.operations:
intermediate_fname = op(intermediate_fname)
shutil.move(intermediate_fname, clean_filename)
diff --git a/paper2remarkable/providers/_info.py b/paper2remarkable/providers/_info.py
new file mode 100644
index 0000000..04efcb1
--- /dev/null
+++ b/paper2remarkable/providers/_info.py
@@ -0,0 +1,103 @@
+# -*- coding: utf-8 -*-
+
+"""Functionality for retrieving paper info
+"""
+
+import logging
+import titlecase
+import unidecode
+import bs4
+
+from ..utils import clean_string, get_page_with_retry
+
+
+class Informer:
+ """Base class for the informers.
+
+ The "informer" class is used to retrieve the title, authors, and year of
+ publication of the provided paper.
+
+ This base class provides the main functionality, but because various
+ outlets use different conventions to embed author, title, and publication
+ year information, we expect that individual providers will subclass this
+ class and overwrite some of the methods.
+ """
+
+ meta_author_key = "citation_author"
+ meta_title_key = "citation_title"
+ meta_date_key = "citation_date"
+
+ def __init__(self, title=None, authors=None, year=None):
+ self.title = title
+ self.authors = authors or []
+ self.year = year
+
+ def get_filename(self, abs_url):
+ """ Generate nice filename using the paper information
+
+        The provided url must be to an HTML page where this information can be
+        found, not to the PDF file itself.
+ """
+ logging.info("Generating output filename")
+
+ # Retrieve the paper information
+ self.get_info(abs_url)
+
+ # we assume that the list of authors is surname only.
+ if len(self.authors) > 3:
+ authors = self.authors[0] + "_et_al"
+ else:
+ authors = "_".join(self.authors)
+ authors = clean_string(authors)
+
+ # Clean the title and make it titlecase
+ title = clean_string(self.title)
+ title = titlecase.titlecase(title)
+ title = title.replace(" ", "_")
+
+ year = str(self.year)
+
+ name = authors + "_-_" + title + "_" + year + ".pdf"
+ name = unidecode.unidecode(name)
+ logging.info("Created filename: %s" % name)
+ return name
+
+ def get_info(self, url):
+ logging.info("Getting paper info")
+ page = get_page_with_retry(url)
+ soup = bs4.BeautifulSoup(page, "html.parser")
+ self.authors = self.authors or self.get_authors(soup)
+ self.title = self.title or self.get_title(soup)
+ self.year = self.year or self.get_year(soup)
+
+ ## Title
+
+ def get_title(self, soup):
+ target = soup.find_all("meta", {"name": self.meta_title_key})
+ return target[0]["content"]
+
+ ## Authors
+
+ def get_authors(self, soup):
+ authors = [
+ x["content"]
+ for x in soup.find_all("meta", {"name": self.meta_author_key})
+ ]
+ return self._format_authors(authors)
+
+ def _format_authors(self, soup_authors, sep=",", idx=0, op=None):
+ op = (lambda x: x) if op is None else op
+ # format the author list retrieved by bs4
+ return [x.strip().split(sep)[idx].strip() for x in op(soup_authors)]
+
+ ## Year
+
+ def _format_year(self, soup_date):
+ return soup_date.split("/")[0]
+
+ def get_year(self, soup):
+ """ Retrieve the contents of the meta_date_key field and format it """
+ date = soup.find_all("meta", {"name": self.meta_date_key})[0][
+ "content"
+ ]
+ return self._format_year(date)
diff --git a/paper2remarkable/providers/acm.py b/paper2remarkable/providers/acm.py
index be98e16..e14efa7 100644
--- a/paper2remarkable/providers/acm.py
+++ b/paper2remarkable/providers/acm.py
@@ -11,23 +11,38 @@ Copyright: 2019, G.J.J. van den Burg
import bs4
import re
-from . import Provider
-from ..utils import exception
+from ._base import Provider
+from ._info import Informer
+from .. import GITHUB_URL
+from ..utils import exception, get_page_with_retry
-# TODO: put this somewhere central, now multiply defined
-GITHUB_URL = "https://github.com/GjjvdBurg/arxiv2remarkable"
-
-class ACM(Provider):
+class ACMInformer(Informer):
meta_author_key = "citation_authors"
+ def _format_authors(self, soup_authors):
+ op = lambda x: x[0].split(";")
+ return super()._format_authors(soup_authors, sep=",", idx=0, op=op)
+
+ def _format_year(self, soup_date):
+ if not re.match("\d{2}/\d{2}/\d{4}", soup_date.strip()):
+ self.warn(
+ "Couldn't extract year from ACM page, please raise an "
+ "issue on GitHub so it can be fixed: %s" % GITHUB_URL
+ )
+ return soup_date.strip().split("/")[-1]
+
+
+class ACM(Provider):
+
re_abs = "https?://dl.acm.org/citation.cfm\?id=\d+"
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
+ self.informer = ACMInformer()
def get_acm_pdf_url(self, url):
- page = self.get_page_with_retry(url)
+ page = get_page_with_retry(url)
soup = bs4.BeautifulSoup(page, "html.parser")
thea = None
for a in soup.find_all("a"):
@@ -60,15 +75,3 @@ class ACM(Provider):
def validate(src):
m = re.fullmatch(ACM.re_abs, src)
return not m is None
-
- def _format_authors(self, soup_authors):
- op = lambda x: x[0].split(";")
- return super()._format_authors(soup_authors, sep=",", idx=0, op=op)
-
- def _format_date(self, soup_date):
- if not re.match("\d{2}/\d{2}/\d{4}", soup_date.strip()):
- self.warn(
- "Couldn't extract year from ACM page, please raise an "
- "issue on GitHub so it can be fixed: %s" % GITHUB_URL
- )
- return soup_date.strip().split("/")[-1]
diff --git a/paper2remarkable/providers/arxiv.py b/paper2remarkable/providers/arxiv.py
index b1982f4..d950e47 100644
--- a/paper2remarkable/providers/arxiv.py
+++ b/paper2remarkable/providers/arxiv.py
@@ -11,11 +11,17 @@ Copyright: 2019, G.J.J. van den Burg
import os
import re
import subprocess
+import logging
+from ._info import Informer
from ._base import Provider
from ..utils import exception
+class ArxivInformer(Informer):
+ pass
+
+
class Arxiv(Provider):
re_abs = "https?://arxiv.org/abs/\d{4}\.\d{4,5}(v\d+)?"
@@ -23,6 +29,7 @@ class Arxiv(Provider):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
+ self.informer = ArxivInformer()
# register the dearxiv operation
self.operations.insert(0, ("dearxiv", self.dearxiv))
@@ -45,7 +52,7 @@ class Arxiv(Provider):
def dearxiv(self, input_file):
"""Remove the arXiv timestamp from a pdf"""
- self.log("Removing arXiv timestamp")
+ logging.info("Removing arXiv timestamp")
basename = os.path.splitext(input_file)[0]
uncompress_file = basename + "_uncompress.pdf"
diff --git a/paper2remarkable/providers/local.py b/paper2remarkable/providers/local.py
index 68ce030..b1201d3 100644
--- a/paper2remarkable/providers/local.py
+++ b/paper2remarkable/providers/local.py
@@ -11,24 +11,28 @@ Copyright: 2019, G.J.J. van den Burg
import os
import shutil
-from . import Provider
+from ._base import Provider
+from ._info import Informer
+
+
+class LocalFileInformer(Informer):
+ def get_filenames(self, abs_url):
+ return os.path.basename(abs_url)
class LocalFile(Provider):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
+ self.informer = LocalFileInformer()
+
+ def get_abs_pdf_url(self, url):
+ # The 'url' is the path to the local file. We use this as abs_url and
+ # pdf_url.
+ return url, url
def validate(src):
return os.path.exists(src)
- def retrieve_pdf(self, src, filename):
- source = os.path.join(self.initial_dir, src)
+ def retrieve_pdf(self, pdf_url, filename):
+ source = os.path.join(self.initial_dir, pdf_url)
shutil.copy(source, filename)
-
- def get_paper_info(self, src):
- return {"filename": src}
-
- def create_filename(self, info, filename=None):
- if not filename is None:
- return filename
- return os.path.basename(info["filename"])
diff --git a/paper2remarkable/providers/openreview.py b/paper2remarkable/providers/openreview.py
index b7e1d77..bfb139d 100644
--- a/paper2remarkable/providers/openreview.py
+++ b/paper2remarkable/providers/openreview.py
@@ -10,19 +10,27 @@ Copyright: 2019, G.J.J. van den Burg
import re
-from . import Provider
+from ._base import Provider
+from ._info import Informer
from ..utils import exception
-class OpenReview(Provider):
+class OpenReviewInformer(Informer):
meta_date_key = "citation_publication_date"
+ def _format_authors(self, soup_authors):
+ return super()._format_authors(soup_authors, sep=" ", idx=-1)
+
+
+class OpenReview(Provider):
+
re_abs = "https?://openreview.net/forum\?id=[A-Za-z0-9]+"
re_pdf = "https?://openreview.net/pdf\?id=[A-Za-z0-9]+"
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
+ self.informer = OpenReviewInformer()
def get_abs_pdf_urls(self, url):
""" Get the pdf and abstract url from a OpenReview url """
@@ -41,6 +49,3 @@ class OpenReview(Provider):
return re.match(OpenReview.re_abs, src) or re.match(
OpenReview.re_pdf, src
)
-
- def _format_authors(self, soup_authors):
- return super()._format_authors(soup_authors, sep=" ", idx=-1)
diff --git a/paper2remarkable/providers/pdf_url.py b/paper2remarkable/providers/pdf_url.py
index 56427d3..f28c742 100644
--- a/paper2remarkable/providers/pdf_url.py
+++ b/paper2remarkable/providers/pdf_url.py
@@ -10,13 +10,25 @@ Copyright: 2019, G.J.J. van den Burg
import urllib
-from . import Provider
+from ._base import Provider
+from ._info import Informer
+
from ..utils import exception
+class PdfUrlInformer(Informer):
+
+ def get_filename(self, abs_url):
+        # reaching this means the user did not provide a filename
+ exception(
+ "Filename must be provided with PDFUrlProvider (use --filename)"
+ )
+
+
class PdfUrl(Provider):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
+ self.informer = PdfUrlInformer()
def validate(src):
try:
@@ -24,16 +36,3 @@ class PdfUrl(Provider):
return all([result.scheme, result.netloc, result.path])
except:
return False
-
- def retrieve_pdf(self, url, filename):
- self.download_url(url, filename)
-
- def get_paper_info(self, src):
- return None
-
- def create_filename(self, info, filename=None):
- if filename is None:
- exception(
- "Filename must be provided with PDFUrlProvider (use --filename)"
- )
- return filename
diff --git a/paper2remarkable/providers/pubmed.py b/paper2remarkable/providers/pubmed.py
index 29bdb31..ba4cca0 100644
--- a/paper2remarkable/providers/pubmed.py
+++ b/paper2remarkable/providers/pubmed.py
@@ -10,13 +10,27 @@ Copyright: 2019, G.J.J. van den Burg
import re
-from . import Provider
+from ._base import Provider
+from ._info import Informer
from ..utils import exception
-class Pubmed(Provider):
+
+class PubMedInformer(Informer):
meta_author_key = "citation_authors"
+ def _format_authors(self, soup_authors):
+ op = lambda x: x[0].split(",")
+ return super()._format_authors(soup_authors, sep=" ", idx=-1, op=op)
+
+ def _format_year(self, soup_date):
+ if re.match("\w+\ \d{4}", soup_date):
+ return soup_date.split(" ")[-1]
+ return soup_date.replace(" ", "_")
+
+
+class PubMed(Provider):
+
re_abs = "https?://www.ncbi.nlm.nih.gov/pmc/articles/PMC\d+/?"
re_pdf = (
"https?://www.ncbi.nlm.nih.gov/pmc/articles/PMC\d+/pdf/nihms\d+\.pdf"
@@ -24,6 +38,7 @@ class Pubmed(Provider):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
+ self.informer = PubMedInformer()
def get_abs_pdf_urls(self, url):
"""Get the pdf and html url from a given PMC url """
@@ -39,13 +54,4 @@ class Pubmed(Provider):
return abs_url, pdf_url
def validate(src):
- return re.match(Pubmed.re_abs, src) or re.match(Pubmed.re_pdf, src)
-
- def _format_authors(self, soup_authors):
- op = lambda x: x[0].split(",")
- return super()._format_authors(soup_authors, sep=" ", idx=-1, op=op)
-
- def _format_date(self, soup_date):
- if re.match("\w+\ \d{4}", soup_date):
- return soup_date.split(" ")[-1]
- return soup_date.replace(" ", "_")
+ return re.match(PubMed.re_abs, src) or re.match(PubMed.re_pdf, src)
diff --git a/paper2remarkable/providers/springer.py b/paper2remarkable/providers/springer.py
index ce16007..ce4acdd 100644
--- a/paper2remarkable/providers/springer.py
+++ b/paper2remarkable/providers/springer.py
@@ -11,19 +11,27 @@ Copyright: 2019, G.J.J. van den Burg
import re
import urllib
-from . import Provider
+from ._base import Provider
+from ._info import Informer
from ..utils import exception
-class Springer(Provider):
+class SpringerInformer(Informer):
meta_date_key = "citation_online_date"
+ def _format_authors(self, soup_authors):
+ return super()._format_authors(soup_authors, sep=" ", idx=-1)
+
+
+class Springer(Provider):
+
re_abs = "https?:\/\/link.springer.com\/article\/10\.\d{4}\/[a-z0-9\-]+"
re_pdf = "https?:\/\/link\.springer\.com\/content\/pdf\/10\.\d{4}(%2F|\/)[a-z0-9\-]+\.pdf"
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
+ self.informer = SpringerInformer()
def get_abs_pdf_urls(self, url):
""" Get the pdf and abstract urls from a Springer url """
@@ -39,6 +47,3 @@ class Springer(Provider):
def validate(src):
return re.match(Springer.re_abs, src) or re.match(Springer.re_pdf, src)
-
- def _format_authors(self, soup_authors):
- return super()._format_authors(soup_authors, sep=" ", idx=-1)
diff --git a/paper2remarkable/utils.py b/paper2remarkable/utils.py
index 15cac95..2bed231 100644
--- a/paper2remarkable/utils.py
+++ b/paper2remarkable/utils.py
@@ -17,7 +17,7 @@ import sys
import time
import unidecode
-GITHUB_URL = "https://github.com/GjjvdBurg/arxiv2remarkable"
+from . import GITHUB_URL
HEADERS = {
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) "
@@ -47,8 +47,8 @@ def clean_string(s):
return cleaned
-def check_file_is_pdf(filename):
- """Check that a given file is a PDF file.
+def assert_file_is_pdf(filename):
+ """Assert that a given file is a PDF file.
This is done by trying to open it using PyPDF2.
"""