author    Gertjan van den Burg <gertjanvandenburg@gmail.com>  2019-04-05 18:15:10 +0100
committer Gertjan van den Burg <gertjanvandenburg@gmail.com>  2019-04-05 18:15:10 +0100
commit    92786bc2f898575f80845f9b7111baa3d386734a
tree      caeb10dd6c989ca2a3dd76ebf47db88cb58f9988
parent    Simplify the code
[WIP] rewrite to use Providers
-rwxr-xr-x  arxiv2remarkable.py  177
1 file changed, 140 insertions(+), 37 deletions(-)
diff --git a/arxiv2remarkable.py b/arxiv2remarkable.py
index 61a3667..a0b4a94 100755
--- a/arxiv2remarkable.py
+++ b/arxiv2remarkable.py
@@ -18,6 +18,7 @@ License: MIT
"""
+import abc
import PyPDF2
import argparse
import bs4
@@ -43,36 +44,135 @@ HEADERS = {
}
+class Provider(metaclass=abc.ABCMeta):
+    """ ABC for providers of pdf sources """
+
+    def __init__(self):
+        pass
+
+    @staticmethod
+    @abc.abstractmethod
+    def validate(src):
+        """ Validate whether ``src`` is appropriate for this provider """
+
+    @abc.abstractmethod
+    def retrieve_pdf(self, src, filename):
+        """ Download pdf from src and save to filename """
+
+    @abc.abstractmethod
+    def get_paper_info(self, src):
+        """ Retrieve the title/author (surnames)/year information """
+
+    def create_filename(self, info, filename=None):
+        """ Generate filename using the info dict or filename if provided """
+        if filename is not None:
+            return filename
+        # we assume that the list of authors is surname only.
+        logger.info("Generating output filename")
+        if len(info["authors"]) > 3:
+            author_part = info["authors"][0] + "_et_al"
+        else:
+            author_part = "_".join(info["authors"])
+        author_part = author_part.replace(" ", "_")
+        # titlecase first, then replace spaces, so word boundaries survive
+        title = info["title"].replace(",", "").replace(":", "")
+        title_part = titlecase.titlecase(title).replace(" ", "_")
+        year_part = info["date"].split("/")[0]
+        return author_part + "_-_" + title_part + "_" + year_part + ".pdf"
+
+    def run(self, src, filename=None):
+        info = self.get_paper_info(src)
+        clean_filename = self.create_filename(info, filename)
+        tmp_filename = "paper.pdf"
+        self.retrieve_pdf(src, tmp_filename)
+        check_file_is_pdf(tmp_filename)
+
+        ops = [self.dearxiv, self.crop, self.shrink]
+        intermediate_fname = tmp_filename
+        for op in ops:
+            # each operation consumes the output of the previous one
+            intermediate_fname = op(intermediate_fname)
+        shutil.move(intermediate_fname, clean_filename)
+        # TODO: here
+        return clean_filename
+
+
+class ArxivProvider(Provider):
+    def __init__(self):
+        super().__init__()
+        self.abs_url = None
+        self.pdf_url = None
+
+    def get_abs_pdf_urls(self, url):
+        """Get the pdf and abs url from any given arXiv url """
+        if re.match(r"https?://arxiv.org/abs/\d{4}\.\d{4,5}(v\d+)?", url):
+            abs_url = url
+            pdf_url = url.replace("abs", "pdf") + ".pdf"
+        elif re.match(
+            r"https?://arxiv.org/pdf/\d{4}\.\d{4,5}(v\d+)?\.pdf", url
+        ):
+            abs_url = url[:-4].replace("pdf", "abs")
+            pdf_url = url
+        else:
+            exception("Couldn't figure out arXiv urls.")
+        return abs_url, pdf_url
+
+    @staticmethod
+    def validate(src):
+        """Check if the url is to an arXiv page.
+
+        >>> ArxivProvider.validate("https://arxiv.org/abs/1811.11242")
+        True
+        >>> ArxivProvider.validate("https://arxiv.org/pdf/1811.11242.pdf")
+        True
+        >>> ArxivProvider.validate("http://arxiv.org/abs/1811.11242")
+        True
+        >>> ArxivProvider.validate("http://arxiv.org/pdf/1811.11242.pdf")
+        True
+        >>> ArxivProvider.validate("https://arxiv.org/abs/1811.11242v1")
+        True
+        >>> ArxivProvider.validate("https://arxiv.org/pdf/1811.11242v1.pdf")
+        True
+        >>> ArxivProvider.validate("https://gertjanvandenburg.com")
+        False
+        """
+        m = re.match(
+            r"https?://arxiv.org/(abs|pdf)/\d{4}\.\d{4,5}(v\d+)?(\.pdf)?", src
+        )
+        return m is not None
+
+    def retrieve_pdf(self, src, filename):
+        """ Download the file and save as filename """
+        _, pdf_url = self.get_abs_pdf_urls(src)
+        download_url(pdf_url, filename)
+
+    def get_paper_info(self, src):
+        """ Extract the paper's authors, title, and publication year """
+        abs_url, _ = self.get_abs_pdf_urls(src)
+        logger.info("Getting paper info from arXiv")
+        page = get_page_with_retry(abs_url)
+        soup = bs4.BeautifulSoup(page, "html.parser")
+        authors = [
+            x["content"]
+            for x in soup.find_all("meta", {"name": "citation_author"})
+        ]
+        # keep only the surname of each author
+        authors = [x.split(",")[0].strip() for x in authors]
+        title = soup.find_all("meta", {"name": "citation_title"})[0]["content"]
+        date = soup.find_all("meta", {"name": "citation_date"})[0]["content"]
+        return dict(title=title, date=date, authors=authors)
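
For reference, a worked example of the naming scheme in ``Provider.create_filename`` (hypothetical input, not part of this commit; assumes the usual behaviour of the titlecase package):

    >>> info = dict(
    ...     title="Attention is all you need",
    ...     authors=["Vaswani", "Shazeer", "Parmar", "Uszkoreit"],
    ...     date="2017/06/12",
    ... )
    >>> ArxivProvider().create_filename(info)
    'Vaswani_et_al_-_Attention_Is_All_You_Need_2017.pdf'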
+
+
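The remaining sources presumably follow the same pattern. A sketch of how the PMC helpers below might be ported (``PMCProvider`` does not exist in this commit; the boolean return of ``pmc_url`` and the return order of ``get_pmc_urls`` are assumptions):

    class PMCProvider(Provider):
        @staticmethod
        def validate(src):
            return pmc_url(src)  # assumes pmc_url returns a boolean

        def retrieve_pdf(self, src, filename):
            pdf_url, _ = get_pmc_urls(src)  # assumed (pdf_url, html_url) order
            download_url(pdf_url, filename)

        def get_paper_info(self, src):
            raise NotImplementedError  # PMC metadata scraping not ported yet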
def exception(msg):
    print("ERROR: " + msg, file=sys.stderr)
    print("Error occurred. Exiting.", file=sys.stderr)
    raise SystemExit(1)
-def arxiv_url(url):
-    """Check if the url is to an arXiv page.
-
-    >>> validate_url("https://arxiv.org/abs/1811.11242")
-    True
-    >>> validate_url("https://arxiv.org/pdf/1811.11242.pdf")
-    True
-    >>> validate_url("http://arxiv.org/abs/1811.11242")
-    True
-    >>> validate_url("http://arxiv.org/pdf/1811.11242.pdf")
-    True
-    >>> validate_url("https://arxiv.org/abs/1811.11242v1")
-    True
-    >>> validate_url("https://arxiv.org/pdf/1811.11242v1.pdf")
-    True
-    >>> validate_url("https://gertjanvandenburg.com")
-    False
-    """
-    m = re.match(
-        "https?://arxiv.org/(abs|pdf)/\d{4}\.\d{4,5}(v\d+)?(\.pdf)?", url
-    )
-    return not m is None
-
-
def pmc_url(url):
    m = re.fullmatch(
        "https?://www.ncbi.nlm.nih.gov/pmc/articles/PMC\d+.*", url
    )
@@ -101,19 +201,6 @@ def check_file_is_pdf(filename):
    return False
-def get_arxiv_urls(url):
-    """Get the pdf and abs url from any given arXiv url """
-    if re.match("https?://arxiv.org/abs/\d{4}\.\d{4,5}(v\d+)?", url):
-        abs_url = url
-        pdf_url = url.replace("abs", "pdf") + ".pdf"
-    elif re.match("https?://arxiv.org/pdf/\d{4}\.\d{4,5}(v\d+)?\.pdf", url):
-        abs_url = url[:-4].replace("pdf", "abs")
-        pdf_url = url
-    else:
-        exception("Couldn't figure out arXiv urls.")
-    return pdf_url, abs_url
-
-
def get_pmc_urls(url):
    """Get the pdf and html url from a given PMC url """
    if re.match(
@@ -421,6 +508,22 @@ def parse_args():
@logger.catch
+def newmain():
+    args = parse_args()
+
+    # only the arXiv provider has been ported so far
+    providers = [ArxivProvider()]
+    provider = next((p for p in providers if p.validate(args.input)), None)
+    if provider is None:
+        exception("Input not valid, no provider can handle this source.")
+
+    if not args.verbose:
+        logger.remove(0)
+
+    start_wd = os.getcwd()
+    with tempfile.TemporaryDirectory() as working_dir:
+        os.chdir(working_dir)
+        filename = provider.run(args.input)
+        # move the result out of the temporary directory before it is removed
+        shutil.move(filename, os.path.join(start_wd, filename))
+    os.chdir(start_wd)
+
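A quick interpreter check of the provider dispatch (illustrative, not part of the commit):

    >>> ArxivProvider.validate("https://arxiv.org/abs/1811.11242")
    True
    >>> ArxivProvider().get_abs_pdf_urls("https://arxiv.org/pdf/1811.11242v1.pdf")
    ('https://arxiv.org/abs/1811.11242v1', 'https://arxiv.org/pdf/1811.11242v1.pdf')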
+@logger.catch
def main():
    args = parse_args()