From 058589548a6b91350e240468f5ddaa47e7a10abf Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Thu, 24 Oct 2019 22:57:44 +0100 Subject: Move paper info functionality to Informer class --- paper2remarkable/__init__.py | 3 + paper2remarkable/providers/__init__.py | 4 +- paper2remarkable/providers/_base.py | 100 +++++------------------------- paper2remarkable/providers/_info.py | 103 +++++++++++++++++++++++++++++++ paper2remarkable/providers/acm.py | 41 ++++++------ paper2remarkable/providers/arxiv.py | 9 ++- paper2remarkable/providers/local.py | 26 ++++---- paper2remarkable/providers/openreview.py | 15 +++-- paper2remarkable/providers/pdf_url.py | 27 ++++---- paper2remarkable/providers/pubmed.py | 30 +++++---- paper2remarkable/providers/springer.py | 15 +++-- paper2remarkable/utils.py | 6 +- 12 files changed, 221 insertions(+), 158 deletions(-) create mode 100644 paper2remarkable/providers/_info.py diff --git a/paper2remarkable/__init__.py b/paper2remarkable/__init__.py index e69de29..71c1105 100644 --- a/paper2remarkable/__init__.py +++ b/paper2remarkable/__init__.py @@ -0,0 +1,3 @@ +# -*- coding: utf-8 -*- + +GITHUB_URL = "https://github.com/GjjvdBurg/arxiv2remarkable" diff --git a/paper2remarkable/providers/__init__.py b/paper2remarkable/providers/__init__.py index 361c11e..f6f93f9 100644 --- a/paper2remarkable/providers/__init__.py +++ b/paper2remarkable/providers/__init__.py @@ -1,11 +1,11 @@ # -*- coding: utf-8 -*- from .arxiv import Arxiv -from .pubmed import Pubmed +from .pubmed import PubMed from .acm import ACM from .openreview import OpenReview from .springer import Springer from .local import LocalFile from .pdf_url import PdfUrl -providers = [Arxiv, Pubmed, ACM, OpenReview, Springer, LocalFile, PdfUrl] +providers = [Arxiv, PubMed, ACM, OpenReview, Springer, LocalFile, PdfUrl] diff --git a/paper2remarkable/providers/_base.py b/paper2remarkable/providers/_base.py index 8b454b0..ca6ab70 100644 --- a/paper2remarkable/providers/_base.py +++ 
b/paper2remarkable/providers/_base.py @@ -9,31 +9,19 @@ Copyright: 2019, G.J.J. van den Burg """ import abc -import bs4 import logging import os import shutil import tempfile -import titlecase -import unidecode +from ._info import Informer from ..pdf_ops import crop_pdf, center_pdf, blank_pdf, shrink_pdf -from ..utils import ( - check_file_is_pdf, - clean_string, - download_url, - get_page_with_retry, - upload_to_remarkable, -) +from ..utils import assert_file_is_pdf, download_url, upload_to_remarkable class Provider(metaclass=abc.ABCMeta): """ ABC for providers of pdf sources """ - meta_author_key = "citation_author" - meta_title_key = "citation_title" - meta_date_key = "citation_date" - def __init__( self, verbose=False, @@ -54,11 +42,14 @@ class Provider(metaclass=abc.ABCMeta): self.pdfcrop_path = pdfcrop_path self.pdftk_path = pdftk_path self.gs_path = gs_path + self.informer = Informer() - if not self.verbose: + # disable logging if requested + logging.basicConfig(level=logging.INFO) + if not verbose: logging.disable() - # Define the operations to run on the pdf. Providers can add others + # Define the operations to run on the pdf. Providers can add others. 
self.operations = [("crop", self.crop_pdf)] if center: self.operations.append(("center", self.center_pdf)) @@ -84,87 +75,24 @@ class Provider(metaclass=abc.ABCMeta): def shrink_pdf(self, filepath): return shrink_pdf(filepath, gs_path=self.gs_path) - def retrieve_pdf(self, src, filename): + def retrieve_pdf(self, pdf_url, filename): """ Download pdf from src and save to filename """ - _, pdf_url = self.get_abs_pdf_urls(src) + # This must exist so that the LocalFile provider can overwrite it download_url(pdf_url, filename) - def _format_authors(self, soup_authors, sep=",", idx=0, op=None): - op = (lambda x: x) if op is None else op - # format the author list retrieved by bs4 - return [x.strip().split(sep)[idx].strip() for x in op(soup_authors)] - - def get_authors(self, soup): - authors = [ - x["content"] - for x in soup.find_all("meta", {"name": self.meta_author_key}) - ] - return self._format_authors(authors) - - def get_title(self, soup): - target = soup.find_all("meta", {"name": self.meta_title_key}) - return target[0]["content"] - - def _format_date(self, soup_date): - return soup_date - - def get_date(self, soup): - date = soup.find_all("meta", {"name": self.meta_date_key})[0][ - "content" - ] - return self._format_date(date) - - def get_paper_info( - self, - src, - author_key="citation_author", - title_key="citation_title", - date_key="citation_date", - ): - """ Retrieve the title/author (surnames)/year information """ - abs_url, _ = self.get_abs_pdf_urls(src) - logging.info("Getting paper info") - page = get_page_with_retry(abs_url) - soup = bs4.BeautifulSoup(page, "html.parser") - authors = self.get_authors(soup) - title = self.get_title(soup) - date = self.get_date(soup) - return dict(title=title, date=date, authors=authors) - - def create_filename(self, info): - """ Generate filename using the info dict or filename if provided """ - # we assume that the list of authors is surname only. 
- logging.info("Generating output filename") - - if len(info["authors"]) > 3: - author_part = info["authors"][0] + "_et_al" - else: - author_part = "_".join(info["authors"]) - author_part = clean_string(author_part) - - title_part = clean_string(info["title"]) - title_part = titlecase.titlecase(title_part).replace(" ", "_") - - year_part = info["date"].split("/")[0] - - name = author_part + "_-_" + title_part + "_" + year_part + ".pdf" - name = unidecode.unidecode(name) - logging.info("Created filename: %s" % name) - return name - def run(self, src, filename=None): - info = self.get_paper_info(src) - clean_filename = filename or self.create_filename(info) + abs_url, pdf_url = self.get_abs_pdf_urls(src) + clean_filename = filename or self.informer.get_filename(abs_url) tmp_filename = "paper.pdf" self.initial_dir = os.getcwd() with tempfile.TemporaryDirectory(prefix="a2r_") as working_dir: os.chdir(working_dir) - self.retrieve_pdf(src, tmp_filename) - check_file_is_pdf(tmp_filename) + self.retrieve_pdf(pdf_url, tmp_filename) + assert_file_is_pdf(tmp_filename) intermediate_fname = tmp_filename - for op in self.operations: + for opname, op in self.operations: intermediate_fname = op(intermediate_fname) shutil.move(intermediate_fname, clean_filename) diff --git a/paper2remarkable/providers/_info.py b/paper2remarkable/providers/_info.py new file mode 100644 index 0000000..04efcb1 --- /dev/null +++ b/paper2remarkable/providers/_info.py @@ -0,0 +1,103 @@ +# -*- coding: utf-8 -*- + +"""Functionality for retrieving paper info +""" + +import logging +import titlecase +import unidecode +import bs4 + +from ..utils import clean_string, get_page_with_retry + + +class Informer: + """Base class for the informers. + + The "informer" class is used to retrieve the title, authors, and year of + publication of the provided paper. 
+ + This base class provides the main functionality, but because various + outlets use different conventions to embed author, title, and publication + year information, we expect that individual providers will subclass this + class and override some of the methods. + """ + + meta_author_key = "citation_author" + meta_title_key = "citation_title" + meta_date_key = "citation_date" + + def __init__(self, title=None, authors=None, year=None): + self.title = title + self.authors = authors or [] + self.year = year + + def get_filename(self, abs_url): + """ Generate nice filename using the paper information + + The provided url must be to an HTML page where this information can be + found, not to the PDF file itself. + """ + logging.info("Generating output filename") + + # Retrieve the paper information + self.get_info(abs_url) + + # we assume that the list of authors is surname only. + if len(self.authors) > 3: + authors = self.authors[0] + "_et_al" + else: + authors = "_".join(self.authors) + authors = clean_string(authors) + + # Clean the title and make it titlecase + title = clean_string(self.title) + title = titlecase.titlecase(title) + title = title.replace(" ", "_") + + year = str(self.year) + + name = authors + "_-_" + title + "_" + year + ".pdf" + name = unidecode.unidecode(name) + logging.info("Created filename: %s" % name) + return name + + def get_info(self, url): + logging.info("Getting paper info") + page = get_page_with_retry(url) + soup = bs4.BeautifulSoup(page, "html.parser") + self.authors = self.authors or self.get_authors(soup) + self.title = self.title or self.get_title(soup) + self.year = self.year or self.get_year(soup) + + ## Title + + def get_title(self, soup): + target = soup.find_all("meta", {"name": self.meta_title_key}) + return target[0]["content"] + + ## Authors + + def get_authors(self, soup): + authors = [ + x["content"] + for x in soup.find_all("meta", {"name": self.meta_author_key}) + ] + return self._format_authors(authors) + + def 
_format_authors(self, soup_authors, sep=",", idx=0, op=None): + op = (lambda x: x) if op is None else op + # format the author list retrieved by bs4 + return [x.strip().split(sep)[idx].strip() for x in op(soup_authors)] + + ## Year + + def _format_year(self, soup_date): + return soup_date.split("/")[0] + + def get_year(self, soup): + """ Retrieve the contents of the meta_date_key field and format it """ + date = soup.find_all("meta", {"name": self.meta_date_key})[0][ + "content" + ] + return self._format_year(date) diff --git a/paper2remarkable/providers/acm.py b/paper2remarkable/providers/acm.py index be98e16..e14efa7 100644 --- a/paper2remarkable/providers/acm.py +++ b/paper2remarkable/providers/acm.py @@ -11,23 +11,38 @@ Copyright: 2019, G.J.J. van den Burg import bs4 import re -from . import Provider -from ..utils import exception +from ._base import Provider +from ._info import Informer +from .. import GITHUB_URL +from ..utils import exception, get_page_with_retry -# TODO: put this somewhere central, now multiply defined -GITHUB_URL = "https://github.com/GjjvdBurg/arxiv2remarkable" - -class ACM(Provider): +class ACMInformer(Informer): meta_author_key = "citation_authors" + def _format_authors(self, soup_authors): + op = lambda x: x[0].split(";") + return super()._format_authors(soup_authors, sep=",", idx=0, op=op) + + def _format_year(self, soup_date): + if not re.match("\d{2}/\d{2}/\d{4}", soup_date.strip()): + self.warn( + "Couldn't extract year from ACM page, please raise an " + "issue on GitHub so it can be fixed: %s" % GITHUB_URL + ) + return soup_date.strip().split("/")[-1] + + +class ACM(Provider): + re_abs = "https?://dl.acm.org/citation.cfm\?id=\d+" def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) + self.informer = ACMInformer() def get_acm_pdf_url(self, url): - page = self.get_page_with_retry(url) + page = get_page_with_retry(url) soup = bs4.BeautifulSoup(page, "html.parser") thea = None for a in soup.find_all("a"): @@ -60,15 
+75,3 @@ class ACM(Provider): def validate(src): m = re.fullmatch(ACM.re_abs, src) return not m is None - - def _format_authors(self, soup_authors): - op = lambda x: x[0].split(";") - return super()._format_authors(soup_authors, sep=",", idx=0, op=op) - - def _format_date(self, soup_date): - if not re.match("\d{2}/\d{2}/\d{4}", soup_date.strip()): - self.warn( - "Couldn't extract year from ACM page, please raise an " - "issue on GitHub so it can be fixed: %s" % GITHUB_URL - ) - return soup_date.strip().split("/")[-1] diff --git a/paper2remarkable/providers/arxiv.py b/paper2remarkable/providers/arxiv.py index b1982f4..d950e47 100644 --- a/paper2remarkable/providers/arxiv.py +++ b/paper2remarkable/providers/arxiv.py @@ -11,11 +11,17 @@ Copyright: 2019, G.J.J. van den Burg import os import re import subprocess +import logging +from ._info import Informer from ._base import Provider from ..utils import exception +class ArxivInformer(Informer): + pass + + class Arxiv(Provider): re_abs = "https?://arxiv.org/abs/\d{4}\.\d{4,5}(v\d+)?" @@ -23,6 +29,7 @@ class Arxiv(Provider): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) + self.informer = ArxivInformer() # register the dearxiv operation self.operations.insert(0, ("dearxiv", self.dearxiv)) @@ -45,7 +52,7 @@ class Arxiv(Provider): def dearxiv(self, input_file): """Remove the arXiv timestamp from a pdf""" - self.log("Removing arXiv timestamp") + logging.info("Removing arXiv timestamp") basename = os.path.splitext(input_file)[0] uncompress_file = basename + "_uncompress.pdf" diff --git a/paper2remarkable/providers/local.py b/paper2remarkable/providers/local.py index 68ce030..b1201d3 100644 --- a/paper2remarkable/providers/local.py +++ b/paper2remarkable/providers/local.py @@ -11,24 +11,28 @@ Copyright: 2019, G.J.J. van den Burg import os import shutil -from . 
import Provider +from ._base import Provider +from ._info import Informer + + +class LocalFileInformer(Informer): + def get_filenames(self, abs_url): + return os.path.basename(abs_url) class LocalFile(Provider): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) + self.informer = LocalFileInformer() + + def get_abs_pdf_url(self, url): + # The 'url' is the path to the local file. We use this as abs_url and + # pdf_url. + return url, url def validate(src): return os.path.exists(src) - def retrieve_pdf(self, src, filename): - source = os.path.join(self.initial_dir, src) + def retrieve_pdf(self, pdf_url, filename): + source = os.path.join(self.initial_dir, pdf_url) shutil.copy(source, filename) - - def get_paper_info(self, src): - return {"filename": src} - - def create_filename(self, info, filename=None): - if not filename is None: - return filename - return os.path.basename(info["filename"]) diff --git a/paper2remarkable/providers/openreview.py b/paper2remarkable/providers/openreview.py index b7e1d77..bfb139d 100644 --- a/paper2remarkable/providers/openreview.py +++ b/paper2remarkable/providers/openreview.py @@ -10,19 +10,27 @@ Copyright: 2019, G.J.J. van den Burg import re -from . 
import Provider +from ._base import Provider +from ._info import Informer from ..utils import exception -class OpenReview(Provider): +class OpenReviewInformer(Informer): meta_date_key = "citation_publication_date" + def _format_authors(self, soup_authors): + return super()._format_authors(soup_authors, sep=" ", idx=-1) + + +class OpenReview(Provider): + re_abs = "https?://openreview.net/forum\?id=[A-Za-z0-9]+" re_pdf = "https?://openreview.net/pdf\?id=[A-Za-z0-9]+" def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) + self.informer = OpenReviewInformer() def get_abs_pdf_urls(self, url): """ Get the pdf and abstract url from a OpenReview url """ @@ -41,6 +49,3 @@ class OpenReview(Provider): return re.match(OpenReview.re_abs, src) or re.match( OpenReview.re_pdf, src ) - - def _format_authors(self, soup_authors): - return super()._format_authors(soup_authors, sep=" ", idx=-1) diff --git a/paper2remarkable/providers/pdf_url.py b/paper2remarkable/providers/pdf_url.py index 56427d3..f28c742 100644 --- a/paper2remarkable/providers/pdf_url.py +++ b/paper2remarkable/providers/pdf_url.py @@ -10,13 +10,25 @@ Copyright: 2019, G.J.J. van den Burg import urllib -from . 
import Provider +from ._base import Provider +from ._info import Informer + from ..utils import exception +class PdfUrlInformer(Informer): + + def get_filename(self, abs_url): + # if this is called, filename must not be provided + exception( + "Filename must be provided with PDFUrlProvider (use --filename)" + ) + + class PdfUrl(Provider): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) + self.informer = PdfUrlInformer() def validate(src): try: @@ -24,16 +36,3 @@ class PdfUrl(Provider): return all([result.scheme, result.netloc, result.path]) except: return False - - def retrieve_pdf(self, url, filename): - self.download_url(url, filename) - - def get_paper_info(self, src): - return None - - def create_filename(self, info, filename=None): - if filename is None: - exception( - "Filename must be provided with PDFUrlProvider (use --filename)" - ) - return filename diff --git a/paper2remarkable/providers/pubmed.py b/paper2remarkable/providers/pubmed.py index 29bdb31..ba4cca0 100644 --- a/paper2remarkable/providers/pubmed.py +++ b/paper2remarkable/providers/pubmed.py @@ -10,13 +10,27 @@ Copyright: 2019, G.J.J. van den Burg import re -from . import Provider +from ._base import Provider +from ._info import Informer from ..utils import exception -class Pubmed(Provider): + +class PubMedInformer(Informer): meta_author_key = "citation_authors" + def _format_authors(self, soup_authors): + op = lambda x: x[0].split(",") + return super()._format_authors(soup_authors, sep=" ", idx=-1, op=op) + + def _format_year(self, soup_date): + if re.match("\w+\ \d{4}", soup_date): + return soup_date.split(" ")[-1] + return soup_date.replace(" ", "_") + + +class PubMed(Provider): + re_abs = "https?://www.ncbi.nlm.nih.gov/pmc/articles/PMC\d+/?" 
re_pdf = ( "https?://www.ncbi.nlm.nih.gov/pmc/articles/PMC\d+/pdf/nihms\d+\.pdf" @@ -24,6 +38,7 @@ class Pubmed(Provider): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) + self.informer = PubMedInformer() def get_abs_pdf_urls(self, url): """Get the pdf and html url from a given PMC url """ @@ -39,13 +54,4 @@ class Pubmed(Provider): return abs_url, pdf_url def validate(src): - return re.match(Pubmed.re_abs, src) or re.match(Pubmed.re_pdf, src) - - def _format_authors(self, soup_authors): - op = lambda x: x[0].split(",") - return super()._format_authors(soup_authors, sep=" ", idx=-1, op=op) - - def _format_date(self, soup_date): - if re.match("\w+\ \d{4}", soup_date): - return soup_date.split(" ")[-1] - return soup_date.replace(" ", "_") + return re.match(PubMed.re_abs, src) or re.match(PubMed.re_pdf, src) diff --git a/paper2remarkable/providers/springer.py b/paper2remarkable/providers/springer.py index ce16007..ce4acdd 100644 --- a/paper2remarkable/providers/springer.py +++ b/paper2remarkable/providers/springer.py @@ -11,19 +11,27 @@ Copyright: 2019, G.J.J. van den Burg import re import urllib -from . 
import Provider +from ._base import Provider +from ._info import Informer from ..utils import exception -class Springer(Provider): +class SpringerInformer(Informer): meta_date_key = "citation_online_date" + def _format_authors(self, soup_authors): + return super()._format_authors(soup_authors, sep=" ", idx=-1) + + +class Springer(Provider): + re_abs = "https?:\/\/link.springer.com\/article\/10\.\d{4}\/[a-z0-9\-]+" re_pdf = "https?:\/\/link\.springer\.com\/content\/pdf\/10\.\d{4}(%2F|\/)[a-z0-9\-]+\.pdf" def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) + self.informer = SpringerInformer() def get_abs_pdf_urls(self, url): """ Get the pdf and abstract urls from a Springer url """ @@ -39,6 +47,3 @@ class Springer(Provider): def validate(src): return re.match(Springer.re_abs, src) or re.match(Springer.re_pdf, src) - - def _format_authors(self, soup_authors): - return super()._format_authors(soup_authors, sep=" ", idx=-1) diff --git a/paper2remarkable/utils.py b/paper2remarkable/utils.py index 15cac95..2bed231 100644 --- a/paper2remarkable/utils.py +++ b/paper2remarkable/utils.py @@ -17,7 +17,7 @@ import sys import time import unidecode -GITHUB_URL = "https://github.com/GjjvdBurg/arxiv2remarkable" +from . import GITHUB_URL HEADERS = { "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) " @@ -47,8 +47,8 @@ def clean_string(s): return cleaned -def check_file_is_pdf(filename): - """Check that a given file is a PDF file. +def assert_file_is_pdf(filename): + """Assert that a given file is a PDF file. This is done by trying to open it using PyPDF2. """ -- cgit v1.2.3