-rw-r--r--  .gitignore            1
-rw-r--r--  README.md            27
-rwxr-xr-x  arxiv2remarkable.py 852
-rw-r--r--  poetry.lock          60
-rw-r--r--  pyproject.toml        1
-rw-r--r--  test.py              97
6 files changed, 579 insertions, 459 deletions
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..c18dd8d
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+__pycache__/
diff --git a/README.md b/README.md
index 3b7be2a..bbda5a7 100644
--- a/README.md
+++ b/README.md
@@ -57,16 +57,18 @@ And here's an example with verbose mode enabled that shows everything the
script does:
```bash
$ python arxiv2remarkable.py -v https://arxiv.org/abs/1811.11242
-2019-02-03 18:11:41.816 | INFO | __main__:download_url:106 - Downloading file at url: https://arxiv.org/pdf/1811.11242v1.pdf
-2019-02-03 18:11:46.833 | INFO | __main__:get_page_with_retry:92 - Downloading url: https://arxiv.org/pdf/1811.11242v1.pdf
-2019-02-03 18:11:46.835 | INFO | __main__:get_paper_info:194 - Getting paper info from arXiv
-2019-02-03 18:11:47.496 | INFO | __main__:get_page_with_retry:92 - Downloading url: https://arxiv.org/abs/1811.11242v1
-2019-02-03 18:11:47.508 | INFO | __main__:generate_filename:206 - Generating output filename
-2019-02-03 18:11:47.508 | INFO | __main__:dearxiv:114 - Removing arXiv timestamp
-2019-02-03 18:11:49.221 | INFO | __main__:crop_pdf:154 - Cropping pdf file
-2019-02-03 18:11:53.247 | INFO | __main__:shrink_pdf:172 - Shrinking pdf file
-2019-02-03 18:11:54.802 | INFO | __main__:upload_to_rm:218 - Starting upload to reMarkable
-2019-02-03 18:11:57.767 | INFO | __main__:upload_to_rm:223 - Upload successful.
+2019-05-30 00:38:27 - INFO - Starting ArxivProvider
+2019-05-30 00:38:27 - INFO - Getting paper info from arXiv
+2019-05-30 00:38:27 - INFO - Downloading url: https://arxiv.org/abs/1811.11242
+2019-05-30 00:38:27 - INFO - Generating output filename
+2019-05-30 00:38:27 - INFO - Created filename: Burg_Nazabal_Sutton_-_Wrangling_Messy_CSV_Files_by_Detecting_Row_and_Type_Patterns_2018.pdf
+2019-05-30 00:38:27 - INFO - Downloading file at url: https://arxiv.org/pdf/1811.11242.pdf
+2019-05-30 00:38:32 - INFO - Downloading url: https://arxiv.org/pdf/1811.11242.pdf
+2019-05-30 00:38:32 - INFO - Removing arXiv timestamp
+2019-05-30 00:38:34 - INFO - Cropping pdf file
+2019-05-30 00:38:37 - INFO - Shrinking pdf file
+2019-05-30 00:38:38 - INFO - Starting upload to reMarkable
+2019-05-30 00:38:42 - INFO - Upload successful.
```
## Dependencies
@@ -79,21 +81,20 @@ The script requires the following external programs to be available:
- [GhostScript](https://www.ghostscript.com/)
- [rMAPI](https://github.com/juruen/rmapi)
-If these scripts are not available on the PATH variable, you can supply them
+If these scripts are not available on the ``PATH`` variable, you can supply them
with the relevant options to the script.
The script also needs the following Python packages:
- [BeautifulSoup4](https://pypi.org/project/beautifulsoup4/): parsing HTML
- [requests](https://pypi.org/project/requests/): getting HTML
-- [loguru](https://pypi.org/project/loguru/): easy logging
- [PyPDF2](https://github.com/mstamy2/PyPDF2): verifying urls point to PDF
- [titlecase](https://pypi.org/project/titlecase/): fancy titles
You can use this line:
```bash
-pip install --user bs4 requests loguru PyPDF2 titlecase
+pip install --user bs4 requests PyPDF2 titlecase
```
# Notes
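
The README section above notes that the external tools can be pointed to explicitly when they are not on the ``PATH``. In the refactored script below, those options correspond to constructor arguments on the provider classes, so the same thing can be done programmatically. A minimal sketch, assuming the module is importable as `arxiv2remarkable` and using placeholder tool paths:

```python
# Sketch only: the keyword arguments come from the Provider base class in the
# arxiv2remarkable.py diff below; the tool paths shown are placeholders.
from arxiv2remarkable import ArxivProvider

prov = ArxivProvider(
    verbose=True,
    upload=False,                          # keep the result locally
    rmapi_path="/home/user/bin/rmapi",     # placeholder locations
    pdfcrop_path="/usr/local/bin/pdfcrop",
    pdftk_path="/usr/local/bin/pdftk",
    gs_path="/usr/local/bin/gs",
)
print(prov.run("https://arxiv.org/abs/1811.11242"))
```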
diff --git a/arxiv2remarkable.py b/arxiv2remarkable.py
index 61a3667..08beaca 100755
--- a/arxiv2remarkable.py
+++ b/arxiv2remarkable.py
@@ -1,16 +1,11 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
-"""
-Given an arXiv paper url this script:
+__version__ = "0.2.0"
+__author__ = "G.J.J. van den Burg"
-1. Downloads the paper
-2. Strips the timestamp
-3. Crops the pdf to remove unnecessary borders
-4. Shrinks the pdf to reduce the filesize
-5. Renames it using the format:
- '_'.join(author_lastnames) + '_-_' + title + '_' + year.pdf
-6. Uploads it to the reMarkable using rMapi.
+"""
+Download a paper from various sources and send it to the reMarkable.
Author: G.J.J. van den Burg
Date: 2019-02-02
@@ -19,8 +14,10 @@ License: MIT
"""
import PyPDF2
+import abc
import argparse
import bs4
+import datetime
import os
import re
import requests
@@ -32,8 +29,6 @@ import time
import titlecase
import urllib.parse
-from loguru import logger
-
GITHUB_URL = "https://github.com/GjjvdBurg/arxiv2remarkable"
HEADERS = {
@@ -43,334 +38,468 @@ HEADERS = {
}
-def exception(msg):
- print("ERROR: " + msg, file=sys.stderr)
- print("Error occurred. Exiting.", file=sys.stderr)
- raise SystemExit(1)
-
-
-def arxiv_url(url):
- """Check if the url is to an arXiv page.
-
- >>> validate_url("https://arxiv.org/abs/1811.11242")
- True
- >>> validate_url("https://arxiv.org/pdf/1811.11242.pdf")
- True
- >>> validate_url("http://arxiv.org/abs/1811.11242")
- True
- >>> validate_url("http://arxiv.org/pdf/1811.11242.pdf")
- True
- >>> validate_url("https://arxiv.org/abs/1811.11242v1")
- True
- >>> validate_url("https://arxiv.org/pdf/1811.11242v1.pdf")
- True
- >>> validate_url("https://gertjanvandenburg.com")
- False
- """
- m = re.match(
- "https?://arxiv.org/(abs|pdf)/\d{4}\.\d{4,5}(v\d+)?(\.pdf)?", url
- )
- return not m is None
-
+class Provider(metaclass=abc.ABCMeta):
+ """ ABC for providers of pdf sources """
-def pmc_url(url):
- m = re.fullmatch(
- "https?://www.ncbi.nlm.nih.gov/pmc/articles/PMC\d+.*", url
- )
- return not m is None
-
-
-def acm_url(url):
- m = re.fullmatch("https?://dl.acm.org/citation.cfm\?id=\d+", url)
- return not m is None
-
-
-def valid_url(url):
- try:
- result = urllib.parse.urlparse(url)
- return all([result.scheme, result.netloc, result.path])
- except:
- return False
+ def __init__(
+ self,
+ verbose=False,
+ upload=True,
+ debug=False,
+ remarkable_dir="/",
+ rmapi_path="rmapi",
+ pdfcrop_path="pdfcrop",
+ pdftk_path="pdftk",
+ gs_path="gs",
+ ):
+ self.verbose = verbose
+ self.upload = upload
+ self.debug = debug
+ self.remarkable_dir = remarkable_dir
+ self.rmapi_path = rmapi_path
+ self.pdfcrop_path = pdfcrop_path
+ self.pdftk_path = pdftk_path
+ self.gs_path = gs_path
+
+ self.log("Starting %s" % type(self).__name__)
+
+ def log(self, msg, mode="info"):
+ if not self.verbose:
+ return
+ if not mode in ["info", "warning"]:
+ raise ValueError("unknown logging mode.")
+ now = datetime.datetime.now()
+ print(
+ now.strftime("%Y-%m-%d %H:%M:%S")
+ + " - "
+ + mode.upper()
+ + " - "
+ + msg
+ )
+ def warn(self, msg):
+ self.log(msg, mode="warning")
+
+ @staticmethod
+ @abc.abstractmethod
+ def validate(src):
+ """ Validate whether ``src`` is appropriate for this provider """
+
+ @abc.abstractmethod
+ def retrieve_pdf(self, src, filename):
+ """ Download pdf from src and save to filename """
+
+ @abc.abstractmethod
+ def get_paper_info(self, src):
+ """ Retrieve the title/author (surnames)/year information """
+
+ def create_filename(self, info, filename=None):
+ """ Generate filename using the info dict or filename if provided """
+ if not filename is None:
+ return filename
+ # we assume that the list of authors is surname only.
+ self.log("Generating output filename")
+ if len(info["authors"]) > 3:
+ author_part = info["authors"][0] + "_et_al"
+ else:
+ author_part = "_".join(info["authors"])
+ author_part = author_part.replace(" ", "_")
+ title = info["title"].replace(",", "").replace(":", "")
+ title_part = titlecase.titlecase(title).replace(" ", "_")
+ year_part = info["date"].split("/")[0]
+ name = author_part + "_-_" + title_part + "_" + year_part + ".pdf"
+ self.log("Created filename: %s" % name)
+ return name
+
+ def crop_pdf(self, filepath):
+ self.log("Cropping pdf file")
+ status = subprocess.call(
+ [self.pdfcrop_path, "--margins", "15 40 15 15", filepath],
+ stdout=subprocess.DEVNULL,
+ )
+ if not status == 0:
+ self.warn("Failed to crop the pdf file at: %s" % filepath)
+ return filepath
+ cropped_file = os.path.splitext(filepath)[0] + "-crop.pdf"
+ if not os.path.exists(cropped_file):
+ self.warn(
+ "Can't find cropped file '%s' where expected." % cropped_file
+ )
+ return filepath
+ return cropped_file
-def check_file_is_pdf(filename):
- try:
- PyPDF2.PdfFileReader(open(filename, "rb"))
- return True
- except PyPDF2.utils.PdfReadError:
- return False
+ def shrink_pdf(self, filepath):
+ self.log("Shrinking pdf file")
+ output_file = os.path.splitext(filepath)[0] + "-shrink.pdf"
+ status = subprocess.call(
+ [
+ self.gs_path,
+ "-sDEVICE=pdfwrite",
+ "-dCompatibilityLevel=1.4",
+ "-dPDFSETTINGS=/printer",
+ "-dNOPAUSE",
+ "-dBATCH",
+ "-dQUIET",
+ "-sOutputFile=%s" % output_file,
+ filepath,
+ ]
+ )
+ if not status == 0:
+ self.warn("Failed to shrink the pdf file")
+ return filepath
+ return output_file
+ def check_file_is_pdf(self, filename):
+ try:
+ fp = open(filename, "rb")
+ pdf = PyPDF2.PdfFileReader(fp, strict=False)
+ fp.close()
+ del pdf
+ return True
+ except PyPDF2.utils.PdfReadError:
+ exception("Downloaded file isn't a valid pdf file.")
+
+ def download_url(self, url, filename):
+ """Download the content of an url and save it to a filename """
+ self.log("Downloading file at url: %s" % url)
+ content = self.get_page_with_retry(url)
+ with open(filename, "wb") as fid:
+ fid.write(content)
+
+ def get_page_with_retry(self, url, tries=5):
+ count = 0
+ while count < tries:
+ count += 1
+ error = False
+ try:
+ res = requests.get(url, headers=HEADERS)
+ except requests.exceptions.ConnectionError:
+ error = True
+ if error or not res.ok:
+ time.sleep(5)
+ self.warn("Error getting url %s. Retrying in 5 seconds" % url)
+ continue
+ self.log("Downloading url: %s" % url)
+ return res.content
-def get_arxiv_urls(url):
- """Get the pdf and abs url from any given arXiv url """
- if re.match("https?://arxiv.org/abs/\d{4}\.\d{4,5}(v\d+)?", url):
- abs_url = url
- pdf_url = url.replace("abs", "pdf") + ".pdf"
- elif re.match("https?://arxiv.org/pdf/\d{4}\.\d{4,5}(v\d+)?\.pdf", url):
- abs_url = url[:-4].replace("pdf", "abs")
- pdf_url = url
- else:
- exception("Couldn't figure out arXiv urls.")
- return pdf_url, abs_url
+ def upload_to_rm(self, filepath):
+ remarkable_dir = self.remarkable_dir.rstrip("/")
+ self.log("Starting upload to reMarkable")
+ if remarkable_dir:
+ status = subprocess.call(
+ [self.rmapi_path, "mkdir", remarkable_dir],
+ stdout=subprocess.DEVNULL,
+ )
+ if not status == 0:
+ exception(
+ "Creating directory %s on reMarkable failed"
+ % remarkable_dir
+ )
+ status = subprocess.call(
+ [self.rmapi_path, "put", filepath, remarkable_dir + "/"],
+ stdout=subprocess.DEVNULL,
+ )
+ if not status == 0:
+ exception("Uploading file %s to reMarkable failed" % filepath)
+ self.log("Upload successful.")
+ def dearxiv(self, input_file):
+ """Remove the arXiv timestamp from a pdf"""
+ self.log("Removing arXiv timestamp")
+ basename = os.path.splitext(input_file)[0]
+ uncompress_file = basename + "_uncompress.pdf"
-def get_pmc_urls(url):
- """Get the pdf and html url from a given PMC url """
- if re.match(
- "https?://www.ncbi.nlm.nih.gov/pmc/articles/PMC\d+/pdf/nihms\d+\.pdf",
- url,
- ):
- idx = url.index("pdf")
- abs_url = url[: idx - 1]
- pdf_url = url
- elif re.match("https?://www.ncbi.nlm.nih.gov/pmc/articles/PMC\d+/?", url):
- abs_url = url
- pdf_url = url.rstrip("/") + "/pdf" # it redirects, usually
- else:
- exception("Couldn't figure out PMC urls.")
- return pdf_url, abs_url
-
-
-def get_acm_pdf_url(url):
- page = get_page_with_retry(url)
- soup = bs4.BeautifulSoup(page, "html.parser")
- thea = None
- for a in soup.find_all("a"):
- if a.get("name") == "FullTextPDF":
- thea = a
- break
- if thea is None:
- return None
- href = thea.get("href")
- if href.startswith("http"):
- return href
- else:
- return "https://dl.acm.org/" + href
-
-
-def get_acm_urls(url):
- if re.match("https?://dl.acm.org/citation.cfm\?id=\d+", url):
- abs_url = url
- pdf_url = get_acm_pdf_url(url)
- if pdf_url is None:
- exception("Couldn't extract PDF url from ACM citation page.")
- else:
- exception(
- "Couldn't figure out ACM urls, please provide a URL of the "
- "format: http(s)://dl.acm.org/citation.cfm?id=..."
+ status = subprocess.call(
+ [
+ self.pdftk_path,
+ input_file,
+ "output",
+ uncompress_file,
+ "uncompress",
+ ]
)
- return pdf_url, abs_url
+ if not status == 0:
+ exception("pdftk failed to uncompress the pdf.")
+
+ with open(uncompress_file, "rb") as fid:
+ data = fid.read()
+ # Remove the text element
+ data = re.sub(
+ b"\(arXiv:\d{4}\.\d{4,5}v\d\s+\[\w+\.\w+\]\s+\d{1,2}\s\w{3}\s\d{4}\)Tj",
+ b"()Tj",
+ data,
+ )
+ # Remove the URL element
+ data = re.sub(
+ b"<<\\n\/URI \(http://arxiv\.org/abs/\d{4}\.\d{4,5}v\d\)\\n\/S /URI\\n>>\\n",
+ b"",
+ data,
+ )
+ removed_file = basename + "_removed.pdf"
+ with open(removed_file, "wb") as oid:
+ oid.write(data)
-def get_page_with_retry(url):
- """Get the content of an url, retrying up to five times on failure. """
+ output_file = basename + "_dearxiv.pdf"
+ status = subprocess.call(
+ [self.pdftk_path, removed_file, "output", output_file, "compress"]
+ )
+ if not status == 0:
+ exception("pdftk failed to compress the pdf.")
+
+ return output_file
+
+ def run(self, src, filename=None):
+ info = self.get_paper_info(src)
+ clean_filename = self.create_filename(info, filename)
+ tmp_filename = "paper.pdf"
+
+ self.initial_dir = os.getcwd()
+ with tempfile.TemporaryDirectory() as working_dir:
+ os.chdir(working_dir)
+ self.retrieve_pdf(src, tmp_filename)
+ self.check_file_is_pdf(tmp_filename)
+
+ ops = [self.dearxiv, self.crop_pdf, self.shrink_pdf]
+ intermediate_fname = tmp_filename
+ for op in ops:
+ intermediate_fname = op(intermediate_fname)
+ shutil.move(intermediate_fname, clean_filename)
+
+ if self.debug:
+ print("Paused in debug mode in dir: %s" % working_dir)
+ print("Press enter to exit.")
+ return input()
+
+ if self.upload:
+ return self.upload_to_rm(clean_filename)
+
+ target_path = os.path.join(self.initial_dir, clean_filename)
+ while os.path.exists(target_path):
+ base = os.path.splitext(target_path)[0]
+ target_path = base + "_.pdf"
+ shutil.move(clean_filename, target_path)
+ return target_path
+
+
+class ArxivProvider(Provider):
+ def __init__(self, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+
+ def get_abs_pdf_urls(self, url):
+ """Get the pdf and abs url from any given arXiv url """
+ if re.match("https?://arxiv.org/abs/\d{4}\.\d{4,5}(v\d+)?", url):
+ abs_url = url
+ pdf_url = url.replace("abs", "pdf") + ".pdf"
+ elif re.match(
+ "https?://arxiv.org/pdf/\d{4}\.\d{4,5}(v\d+)?\.pdf", url
+ ):
+ abs_url = url[:-4].replace("pdf", "abs")
+ pdf_url = url
+ else:
+ exception("Couldn't figure out arXiv urls.")
+ return abs_url, pdf_url
- def retry(url, count):
- if count < 5:
- logger.info(
- "Caught error for url %s. Retrying in 5 seconds." % url
- )
- time.sleep(5)
+ def validate(src):
+ """Check if the url is to an arXiv page. """
+ m = re.match(
+ "https?://arxiv.org/(abs|pdf)/\d{4}\.\d{4,5}(v\d+)?(\.pdf)?", src
+ )
+ return not m is None
+
+ def retrieve_pdf(self, src, filename):
+ """ Download the file and save as filename """
+ _, pdf_url = self.get_abs_pdf_urls(src)
+ self.download_url(pdf_url, filename)
+
+ def get_paper_info(self, src):
+ """ Extract the paper's authors, title, and publication year """
+ abs_url, _ = self.get_abs_pdf_urls(src)
+ self.log("Getting paper info from arXiv")
+ page = self.get_page_with_retry(abs_url)
+ soup = bs4.BeautifulSoup(page, "html.parser")
+ authors = [
+ x["content"]
+ for x in soup.find_all("meta", {"name": "citation_author"})
+ ]
+ authors = [x.split(",")[0].strip() for x in authors]
+ title = soup.find_all("meta", {"name": "citation_title"})[0]["content"]
+ date = soup.find_all("meta", {"name": "citation_date"})[0]["content"]
+ return dict(title=title, date=date, authors=authors)
+
+
+class PMCProvider(Provider):
+ def __init__(self, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+
+ def get_abs_pdf_urls(self, url):
+ """Get the pdf and html url from a given PMC url """
+ if re.match(
+ "https?://www.ncbi.nlm.nih.gov/pmc/articles/PMC\d+/pdf/nihms\d+\.pdf",
+ url,
+ ):
+ idx = url.index("pdf")
+ abs_url = url[: idx - 1]
+ pdf_url = url
+ elif re.match(
+ "https?://www.ncbi.nlm.nih.gov/pmc/articles/PMC\d+/?", url
+ ):
+ abs_url = url
+ pdf_url = url.rstrip("/") + "/pdf" # it redirects, usually
else:
- exception("Failed to download url: %s" % url)
+ exception("Couldn't figure out PMC urls.")
+ return abs_url, pdf_url
- count = 0
- while True:
- count += 1
- try:
- res = requests.get(url, headers=HEADERS)
- except requests.exceptions.ConnectionError:
- retry(url, count)
- continue
- if res.ok:
- logger.info("Downloading url: %s" % url)
- return res.content
+ def validate(src):
+ m = re.fullmatch(
+ "https?://www.ncbi.nlm.nih.gov/pmc/articles/PMC\d+.*", src
+ )
+ return not m is None
+
+ def retrieve_pdf(self, src, filename):
+ _, pdf_url = self.get_abs_pdf_urls(src)
+ self.download_url(pdf_url, filename)
+
+ def get_paper_info(self, src):
+ """ Extract the paper's authors, title, and publication year """
+ self.log("Getting paper info from PMC")
+ page = self.get_page_with_retry(src)
+ soup = bs4.BeautifulSoup(page, "html.parser")
+ authors = [
+ x["content"]
+ for x in soup.find_all("meta", {"name": "citation_authors"})
+ ]
+ # We only use last names, and this method is a guess at best. I'm open to
+ # more advanced approaches.
+ authors = [
+ x.strip().split(" ")[-1].strip() for x in authors[0].split(",")
+ ]
+ title = soup.find_all("meta", {"name": "citation_title"})[0]["content"]
+ date = soup.find_all("meta", {"name": "citation_date"})[0]["content"]
+ if re.match("\w+\ \d{4}", date):
+ date = date.split(" ")[-1]
+ else:
+ date = date.replace(" ", "_")
+ return dict(title=title, date=date, authors=authors)
+
+
+class ACMProvider(Provider):
+ def __init__(self, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+
+ def get_acm_pdf_url(self, url):
+ page = self.get_page_with_retry(url)
+ soup = bs4.BeautifulSoup(page, "html.parser")
+ thea = None
+ for a in soup.find_all("a"):
+ if a.get("name") == "FullTextPDF":
+ thea = a
+ break
+ if thea is None:
+ return None
+ href = thea.get("href")
+ if href.startswith("http"):
+ return href
+ else:
+ return "https://dl.acm.org/" + href
+
+ def get_abs_pdf_urls(self, url):
+ if re.match("https?://dl.acm.org/citation.cfm\?id=\d+", url):
+ abs_url = url
+ pdf_url = self.get_acm_pdf_url(url)
+ if pdf_url is None:
+ exception(
+ "Couldn't extract PDF url from ACM citation page. Maybe it's behind a paywall?"
+ )
else:
- retry(url, count)
+ exception(
+ "Couldn't figure out ACM urls, please provide a URL of the "
+ "format: http(s)://dl.acm.org/citation.cfm?id=..."
+ )
+ return abs_url, pdf_url
+
+ def retrieve_pdf(self, src, filename):
+ _, pdf_url = self.get_abs_pdf_urls(src)
+ self.download_url(pdf_url, filename)
+
+ def validate(src):
+ m = re.fullmatch("https?://dl.acm.org/citation.cfm\?id=\d+", src)
+ return not m is None
+
+ def get_paper_info(self, src):
+ """ Extract the paper's authors, title, and publication year """
+ self.log("Getting paper info from ACM")
+ page = self.get_page_with_retry(src)
+ soup = bs4.BeautifulSoup(page, "html.parser")
+ authors = [
+ x["content"]
+ for x in soup.find_all("meta", {"name": "citation_authors"})
+ ]
+ # We only use last names, and this method is a guess. I'm open to more
+ # advanced approaches.
+ authors = [
+ x.strip().split(",")[0].strip() for x in authors[0].split(";")
+ ]
+ title = soup.find_all("meta", {"name": "citation_title"})[0]["content"]
+ date = soup.find_all("meta", {"name": "citation_date"})[0]["content"]
+ if not re.match("\d{2}/\d{2}/\d{4}", date.strip()):
+ self.warn(
+ "Couldn't extract year from ACM page, please raise an "
+ "issue on GitHub so I can fix it: %s" % GITHUB_URL
+ )
+ date = date.strip().split("/")[-1]
+ return dict(title=title, date=date, authors=authors)
-def download_url(url, filename):
- """Download the content of an url and save it to a filename """
- logger.info("Downloading file at url: %s" % url)
- content = get_page_with_retry(url)
- with open(filename, "wb") as fid:
- fid.write(content)
+class LocalFileProvider(Provider):
+ def __init__(self, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+ def validate(src):
+ return os.path.exists(src)
-def dearxiv(input_file, pdftk_path="pdftk"):
- """Remove the arXiv timestamp from a pdf"""
- logger.info("Removing arXiv timestamp")
- basename = os.path.splitext(input_file)[0]
- uncompress_file = basename + "_uncompress.pdf"
+ def retrieve_pdf(self, src, filename):
+ source = os.path.join(self.initial_dir, src)
+ shutil.copy(source, filename)
- status = subprocess.call(
- [pdftk_path, input_file, "output", uncompress_file, "uncompress"]
- )
- if not status == 0:
- exception("pdftk failed to uncompress the pdf.")
-
- with open(uncompress_file, "rb") as fid:
- data = fid.read()
- # Remove the text element
- data = re.sub(
- b"\(arXiv:\d{4}\.\d{4,5}v\d\s+\[\w+\.\w+\]\s+\d{1,2}\s\w{3}\s\d{4}\)Tj",
- b"()Tj",
- data,
- )
- # Remove the URL element
- data = re.sub(
- b"<<\\n\/URI \(http://arxiv\.org/abs/\d{4}\.\d{4,5}v\d\)\\n\/S /URI\\n>>\\n",
- b"",
- data,
- )
+ def get_paper_info(self, src):
+ return {"filename": src}
- removed_file = basename + "_removed.pdf"
- with open(removed_file, "wb") as oid:
- oid.write(data)
+ def create_filename(self, info, filename=None):
+ if not filename is None:
+ return filename
+ return os.path.basename(info["filename"])
- output_file = basename + "_dearxiv.pdf"
- status = subprocess.call(
- [pdftk_path, removed_file, "output", output_file, "compress"]
- )
- if not status == 0:
- exception("pdftk failed to compress the pdf.")
- return output_file
+class PdfUrlProvider(Provider):
+ def __init__(self, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+ def validate(src):
+ try:
+ result = urllib.parse.urlparse(src)
+ return all([result.scheme, result.netloc, result.path])
+ except:
+ return False
-def crop_pdf(filepath, pdfcrop_path="pdfcrop"):
- logger.info("Cropping pdf file")
- status = subprocess.call(
- [pdfcrop_path, "--margins", "15 40 15 15", filepath],
- stdout=subprocess.DEVNULL,
- )
- if not status == 0:
- logger.warning("Failed to crop the pdf file at: %s" % filepath)
- return filepath
- cropped_file = os.path.splitext(filepath)[0] + "-crop.pdf"
- if not os.path.exists(cropped_file):
- logger.warning(
- "Can't find cropped file '%s' where expected." % cropped_file
- )
- return filepath
- return cropped_file
-
-
-def shrink_pdf(filepath, gs_path="gs"):
- logger.info("Shrinking pdf file")
- output_file = os.path.splitext(filepath)[0] + "-shrink.pdf"
- status = subprocess.call(
- [
- "gs",
- "-sDEVICE=pdfwrite",
- "-dCompatibilityLevel=1.4",
- "-dPDFSETTINGS=/printer",
- "-dNOPAUSE",
- "-dBATCH",
- "-dQUIET",
- "-sOutputFile=%s" % output_file,
- filepath,
- ]
- )
- if not status == 0:
- logger.warning("Failed to shrink the pdf file")
- return filepath
- return output_file
-
-
-def get_paper_info_arxiv(url):
- """ Extract the paper's authors, title, and publication year """
- logger.info("Getting paper info from arXiv")
- page = get_page_with_retry(url)
- soup = bs4.BeautifulSoup(page, "html.parser")
- authors = [
- x["content"]
- for x in soup.find_all("meta", {"name": "citation_author"})
- ]
- authors = [x.split(",")[0].strip() for x in authors]
- title = soup.find_all("meta", {"name": "citation_title"})[0]["content"]
- date = soup.find_all("meta", {"name": "citation_date"})[0]["content"]
- return dict(title=title, date=date, authors=authors)
-
-
-def get_paper_info_pmc(url):
- """ Extract the paper's authors, title, and publication year """
- logger.info("Getting paper info from PMC")
- page = get_page_with_retry(url)
- soup = bs4.BeautifulSoup(page, "html.parser")
- authors = [
- x["content"]
- for x in soup.find_all("meta", {"name": "citation_authors"})
- ]
- # We only use last names, and this method is a guess at best. I'm open to
- # more advanced approaches.
- authors = [x.strip().split(" ")[-1].strip() for x in authors[0].split(",")]
- title = soup.find_all("meta", {"name": "citation_title"})[0]["content"]
- date = soup.find_all("meta", {"name": "citation_date"})[0]["content"]
- if re.match("\w+\ \d{4}", date):
- date = date.split(" ")[-1]
- else:
- date = date.replace(" ", "_")
- return dict(title=title, date=date, authors=authors)
-
-
-def get_paper_info_acm(url):
- """ Extract the paper's authors, title, and publication year """
- logger.info("Getting paper info from ACM")
- page = get_page_with_retry(url)
- soup = bs4.BeautifulSoup(page, "html.parser")
- authors = [
- x["content"]
- for x in soup.find_all("meta", {"name": "citation_authors"})
- ]
- # We only use last names, and this method is a guess. I'm open to more
- # advanced approaches.
- authors = [x.strip().split(",")[0].strip() for x in authors[0].split(";")]
- title = soup.find_all("meta", {"name": "citation_title"})[0]["content"]
- date = soup.find_all("meta", {"name": "citation_date"})[0]["content"]
- if not re.match("\d{2}/\d{2}/\d{4}", date.strip()):
- logger.warning(
- "Couldn't extract year from ACM page, please raise an "
- "issue on GitHub so I can fix it: %s",
- GITHUB_URL,
- )
- date = date.strip().split("/")[-1]
- return dict(title=title, date=date, authors=authors)
-
-
-def generate_filename(info):
- """ Generate a nice filename for a paper given the info dict """
- # we assume that the list of authors is lastname only.
- logger.info("Generating output filename")
- if len(info["authors"]) > 3:
- author_part = info["authors"][0] + "_et_al"
- else:
- author_part = "_".join(info["authors"])
- author_part = author_part.replace(" ", "_")
- title = info["title"].replace(",", "").replace(":", "").replace(" ", "_")
- title_part = titlecase.titlecase(title)
- year_part = info["date"].split("/")[0]
- return author_part + "_-_" + title_part + "_" + year_part + ".pdf"
-
-
-def upload_to_rm(filepath, remarkable_dir="/", rmapi_path="rmapi"):
- remarkable_dir = remarkable_dir.rstrip("/")
- logger.info("Starting upload to reMarkable")
- if remarkable_dir:
- status = subprocess.call(
- [rmapi_path, "mkdir", remarkable_dir], stdout=subprocess.DEVNULL
- )
- if not status == 0:
+ def retrieve_pdf(self, url, filename):
+ self.download_url(url, filename)
+
+ def get_paper_info(self, src):
+ return None
+
+ def create_filename(self, info, filename=None):
+ if filename is None:
exception(
- "Creating directory %s on reMarkable failed" % remarkable_dir
+ "Filename must be provided with PDFUrlProvider (use --filename)"
)
- status = subprocess.call(
- [rmapi_path, "put", filepath, remarkable_dir + "/"],
- stdout=subprocess.DEVNULL,
- )
- if not status == 0:
- exception("Uploading file %s to reMarkable failed" % filepath)
- logger.info("Upload successful.")
+ return filename
+
+
+def exception(msg):
+ print("ERROR: " + msg, file=sys.stderr)
+ print("Error occurred. Exiting.", file=sys.stderr)
+ raise SystemExit(1)
def parse_args():
@@ -420,82 +549,33 @@ def parse_args():
return parser.parse_args()
-@logger.catch
def main():
args = parse_args()
- if os.path.exists(args.input):
- mode = "local_file"
- elif arxiv_url(args.input):
- mode = "arxiv_url"
- elif pmc_url(args.input):
- mode = "pmc_url"
- elif acm_url(args.input):
- mode = "acm_url"
- elif valid_url(args.input):
- if args.filename is None:
- exception(
- "Filename must be provided with pdf url (use --filename)"
- )
- mode = "pdf_url"
- else:
- exception("Input not a valid url, arxiv url, or existing file.")
-
- if not args.verbose:
- logger.remove(0)
-
- start_wd = os.getcwd()
-
- with tempfile.TemporaryDirectory() as working_dir:
- if mode == "local_file":
- shutil.copy(args.input, working_dir)
- filename = os.path.basename(args.input)
- clean_filename = args.filename if args.filename else filename
-
- os.chdir(working_dir)
- if mode in ["arxiv_url", "pmc_url", "acm_url", "pdf_url"]:
- filename = "paper.pdf"
- if mode == "arxiv_url":
- pdf_url, abs_url = get_arxiv_urls(args.input)
- paper_info = get_paper_info_arxiv(abs_url)
- elif mode == "pmc_url":
- pdf_url, abs_url = get_pmc_urls(args.input)
- paper_info = get_paper_info_pmc(abs_url)
- elif mode == "acm_url":
- pdf_url, abs_url = get_acm_urls(args.input)
- paper_info = get_paper_info_acm(abs_url)
- else:
- pdf_url = args.input
- download_url(pdf_url, filename)
- if not check_file_is_pdf(filename):
- exception("Downloaded file isn't a valid pdf file.")
- if args.filename:
- clean_filename = args.filename
- else:
- clean_filename = generate_filename(paper_info)
-
- dearxived = dearxiv(filename, pdftk_path=args.pdftk)
- cropped = crop_pdf(dearxived, pdfcrop_path=args.pdfcrop)
- shrinked = shrink_pdf(cropped)
- shutil.move(shrinked, clean_filename)
-
- if args.debug:
- print("Paused in debug mode in dir: %s" % working_dir)
- print("Press enter to exit.")
- return input()
-
- if args.no_upload:
- if os.path.exists(os.path.join(start_wd, clean_filename)):
- tmpfname = os.path.splitext(filename)[0] + "_cropped.pdf"
- shutil.move(clean_filename, os.path.join(start_wd, tmpfname))
- else:
- shutil.move(clean_filename, start_wd)
- else:
- upload_to_rm(
- clean_filename,
- remarkable_dir=args.remarkable_dir,
- rmapi_path=args.rmapi,
- )
+ providers = [
+ ArxivProvider,
+ PMCProvider,
+ ACMProvider,
+ LocalFileProvider,
+ PdfUrlProvider,
+ ]
+
+ provider = next((p for p in providers if p.validate(args.input)), None)
+ if provider is None:
+ exception("Input not valid, no provider can handle this source.")
+
+ prov = provider(
+ args.verbose,
+ not args.no_upload,
+ args.debug,
+ args.remarkable_dir,
+ args.rmapi,
+ args.pdfcrop,
+ args.pdftk,
+ args.gs,
+ )
+
+ prov.run(args.input, filename=args.filename)
if __name__ == "__main__":
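
The main change in this file is the move from per-source helper functions to `Provider` subclasses: each provider implements `validate`, `retrieve_pdf`, and `get_paper_info`, and `main()` picks the first provider whose `validate` accepts the input. A minimal sketch of what an additional source could look like under this design (a hypothetical `JMLRProvider`, not part of the patch; the URL pattern is invented for the example):

```python
# Hypothetical provider to illustrate the Provider ABC introduced above.
# Assumes it lives in arxiv2remarkable.py, so Provider, exception, and re
# are already in scope. It behaves like PdfUrlProvider but only accepts
# direct PDF links on jmlr.org.
class JMLRProvider(Provider):
    @staticmethod
    def validate(src):
        # invented URL pattern, for illustration only
        return re.match(r"https?://(www\.)?jmlr\.org/papers/.*\.pdf", src) is not None

    def retrieve_pdf(self, src, filename):
        # reuse the download helper from the base class
        self.download_url(src, filename)

    def get_paper_info(self, src):
        # a real provider would scrape title/authors/year here
        return None

    def create_filename(self, info, filename=None):
        if filename is None:
            exception("Filename must be provided for this provider (use --filename)")
        return filename
```

To be reachable from the command line, such a class would also have to be added to the `providers` list in `main()`.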
diff --git a/poetry.lock b/poetry.lock
index d8a1205..893007f 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -1,16 +1,5 @@
[[package]]
category = "main"
-description = "Produce colored terminal text with an xml-like markup"
-name = "ansimarkup"
-optional = false
-python-versions = "*"
-version = "1.4.0"
-
-[package.dependencies]
-colorama = "*"
-
-[[package]]
-category = "main"
description = "Screen-scraping library"
name = "beautifulsoup4"
optional = false
@@ -22,19 +11,6 @@ soupsieve = ">=1.2"
[[package]]
category = "main"
-description = "Pretty and helpful exceptions, automatically"
-name = "better-exceptions-fork"
-optional = false
-python-versions = "*"
-version = "0.2.1.post6"
-
-[package.dependencies]
-ansimarkup = ">=1.3.0"
-colorama = "*"
-pygments = ">=2.2.0"
-
-[[package]]
-category = "main"
description = "Dummy package for Beautiful Soup"
name = "bs4"
optional = false
@@ -62,14 +38,6 @@ version = "3.0.4"
[[package]]
category = "main"
-description = "Cross-platform colored terminal text."
-name = "colorama"
-optional = false
-python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
-version = "0.4.1"
-
-[[package]]
-category = "main"
description = "Internationalized Domain Names in Applications (IDNA)"
name = "idna"
optional = false
@@ -78,27 +46,6 @@ version = "2.8"
[[package]]
category = "main"
-description = "Python logging made (stupidly) simple"
-name = "loguru"
-optional = false
-python-versions = ">=3.5"
-version = "0.2.5"
-
-[package.dependencies]
-ansimarkup = ">=1.4.0"
-better-exceptions-fork = ">=0.2.1.post6"
-colorama = ">=0.3.4"
-
-[[package]]
-category = "main"
-description = "Pygments is a syntax highlighting package written in Python."
-name = "pygments"
-optional = false
-python-versions = "*"
-version = "2.3.1"
-
-[[package]]
-category = "main"
description = "Python HTTP for Humans."
name = "requests"
optional = false
@@ -128,20 +75,15 @@ python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, <4"
version = "1.24.1"
[metadata]
-content-hash = "b92b4b1d2c4f9d3181044c1ad99fd9bfa49e8618c6ff5de7bd64c557bcc27e39"
+content-hash = "bf39364b4d9cc98c89d858338ce8e7609e35fdd4a7e5cfe256f768c12ed7cb82"
python-versions = "^3.5"
[metadata.hashes]
-ansimarkup = ["06365e3ef89a12734fc408b2449cb4642d5fe2e603e95e7296eff9e98a0fe0b4", "174d920481416cec8d5a707af542d6fba25a1df1c21d8996479c32ba453649a4"]
beautifulsoup4 = ["034740f6cb549b4e932ae1ab975581e6103ac8f942200a0e9759065984391858", "945065979fb8529dd2f37dbb58f00b661bdbcbebf954f93b32fdf5263ef35348", "ba6d5c59906a85ac23dadfe5c88deaf3e179ef565f4898671253e50a78680718"]
-better-exceptions-fork = ["5f0983da51e956dbdaf8b9a3d10e2774b382ce6c6ff2e54685c33e2dbe8f1472"]
bs4 = ["36ecea1fd7cc5c0c6e4a1ff075df26d50da647b75376626cc186e2212886dd3a"]
certifi = ["47f9c83ef4c0c621eaef743f133f09fa8a74a9b75f037e8624f83bd1b6626cb7", "993f830721089fef441cdfeb4b2c8c9df86f0c63239f06bd025a76a7daddb033"]
chardet = ["84ab92ed1c4d4f16916e05906b6b75a6c0fb5db821cc65e70cbd64a3e2a5eaae", "fc323ffcaeaed0e0a02bf4d117757b98aed530d9ed4531e3e15460124c106691"]
-colorama = ["05eed71e2e327246ad6b38c540c4a3117230b19679b875190486ddd2d721422d", "f8ac84de7840f5b9c4e3347b3c1eaa50f7e49c2b07596221daec5edaabbd7c48"]
idna = ["c357b3f628cf53ae2c4c05627ecc484553142ca23264e593d327bcde5e9c3407", "ea8b7f6188e6fa117537c3df7da9fc686d485087abf6ac197f9c46432f7e4a3c"]
-loguru = ["68297d9f23064c2f4764bb5d0c5c767f3ed7f9fc1218244841878f5fc7c94add", "ebac59630946721fd6207264679b267a8bdc290b086226067d6aad86830e3123"]
-pygments = ["5ffada19f6203563680669ee7f53b64dabbeb100eb51b61996085e99c03b284a", "e8218dd399a61674745138520d0d4cf2621d7e032439341bc3f647bff125818d"]
requests = ["502a824f31acdacb3a35b6690b5fbf0bc41d63a24a45c4004352b0242707598e", "7bf2a778576d825600030a110f3c0e3e8edc51dfaafe1c146e39a2027784957b"]
soupsieve = ["466910df7561796a60748826781ebe9a888f7a1668a636ae86783f44d10aae73", "87db12ae79194f0ff9808d2b1641c4f031ae39ffa3cab6b907ea7c1e5e5ed445"]
urllib3 = ["61bf29cada3fc2fbefad4fdf059ea4bd1b4a86d2b6d15e1c7c0b582b9752fe39", "de9529817c93f27c8ccbfead6985011db27bd0ddfcdb2d86f3f663385c6a9c22"]
diff --git a/pyproject.toml b/pyproject.toml
index 6f67ecd..2c28224 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -9,7 +9,6 @@ license = "MIT"
python = "^3.5"
bs4 = "^0.0.1"
requests = "^2.21"
-loguru = "^0.2.5"
[tool.poetry.dev-dependencies]
diff --git a/test.py b/test.py
new file mode 100644
index 0000000..2ec59d8
--- /dev/null
+++ b/test.py
@@ -0,0 +1,97 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+__author__ = "G.J.J. van den Burg"
+
+"""Tests"""
+
+import unittest
+import tempfile
+import hashlib
+import shutil
+import os
+
+from arxiv2remarkable import (
+ ArxivProvider,
+ PMCProvider,
+ ACMProvider,
+ LocalFileProvider,
+ PdfUrlProvider,
+)
+
+
+def md5sum(filename):
+ blocksize = 65536
+ hasher = hashlib.md5()
+ with open(filename, "rb") as fid:
+ buf = fid.read(blocksize)
+ while len(buf) > 0:
+ hasher.update(buf)
+ buf = fid.read(blocksize)
+ return hasher.hexdigest()
+
+
+class Tests(unittest.TestCase):
+ @classmethod
+ def setUpClass(cls):
+ cls.original_dir = os.getcwd()
+
+ def setUp(self):
+ self.test_dir = tempfile.mkdtemp()
+ os.chdir(self.test_dir)
+
+ def tearDown(self):
+ os.chdir(self.original_dir)
+ shutil.rmtree(self.test_dir)
+
+ def test_arxiv(self):
+ prov = ArxivProvider(upload=False)
+ url = "https://arxiv.org/abs/1811.11242v1"
+ exp_filename = "Burg_Nazabal_Sutton_-_Wrangling_Messy_CSV_Files_by_Detecting_Row_and_Type_Patterns_2018.pdf"
+ filename = prov.run(url)
+ self.assertEqual(exp_filename, os.path.basename(filename))
+ fsize = os.path.getsize(filename)
+ self.assertTrue(1054082 < fsize <= 1056082)
+
+ def test_pmc(self):
+ prov = PMCProvider(upload=False)
+ url = "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3474301/"
+ exp_filename = (
+ "Hoogenboom_Manske_-_How_to_Write_a_Scientific_Article_2012.pdf"
+ )
+ filename = prov.run(url)
+ self.assertEqual(exp_filename, os.path.basename(filename))
+ fsize = os.path.getsize(filename)
+ self.assertTrue(376640 < fsize <= 378640)
+
+ def test_acm(self):
+ prov = ACMProvider(upload=False)
+ url = "https://dl.acm.org/citation.cfm?id=3300356"
+ exp_filename = "Muller_et_al_-_How_Data_Science_Workers_Work_With_Data_Discovery_Capture_Curation_Design_Creation_2019.pdf"
+ filename = prov.run(url)
+ self.assertEqual(exp_filename, os.path.basename(filename))
+ fsize = os.path.getsize(filename)
+ self.assertTrue(1691444 < fsize <= 1693444)
+
+ def test_local(self):
+ local_filename = "test.pdf"
+ with open(local_filename, "w") as fp:
+ fp.write(
+ "%PDF-1.1\n%¥±ë\n\n1 0 obj\n << /Type /Catalog\n /Pages 2 0 R\n >>\nendobj\n\n2 0 obj\n << /Type /Pages\n /Kids [3 0 R]\n /Count 1\n /MediaBox [0 0 300 144]\n >>\nendobj\n\n3 0 obj\n << /Type /Page\n /Parent 2 0 R\n /Resources\n << /Font\n << /F1\n << /Type /Font\n /Subtype /Type1\n /BaseFont /Times-Roman\n >>\n >>\n >>\n /Contents 4 0 R\n >>\nendobj\n\n4 0 obj\n << /Length 55 >>\nstream\n BT\n /F1 18 Tf\n 0 0 Td\n (Hello World) Tj\n ET\nendstream\nendobj\n\nxref\n0 5\n0000000000 65535 f \n0000000018 00000 n \n0000000077 00000 n \n0000000178 00000 n \n0000000457 00000 n \ntrailer\n << /Root 1 0 R\n /Size 5\n >>\nstartxref\n565\n%%EOF"
+ )
+ prov = LocalFileProvider(upload=False)
+ filename = prov.run(local_filename)
+ self.assertEqual("test_.pdf", os.path.basename(filename))
+ fsize = os.path.getsize(filename)
+ self.assertTrue(5843 < fsize <= 7843)
+
+ def test_pdfurl(self):
+ prov = PdfUrlProvider(upload=False)
+ url = "http://www.jmlr.org/papers/volume17/14-526/14-526.pdf"
+ filename = prov.run(url, filename="test.pdf")
+ self.assertEqual("test.pdf", os.path.basename(filename))
+ fsize = os.path.getsize(filename)
+ self.assertTrue(1828169 < fsize <= 1830169)
+
+if __name__ == "__main__":
+ unittest.main()
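
The new tests exercise the providers end to end: apart from `test_local`, they download real papers, and every test shells out to pdftk, pdfcrop, and GhostScript, so network access and those tools are required. A small sketch of running a single case with the standard `unittest` machinery, assuming it is executed from the repository root:

```python
# Run only the local-file test; it needs pdftk, pdfcrop, and gs on the PATH,
# but no network access.
import unittest

from test import Tests  # the test.py added in this diff

suite = unittest.TestSuite([Tests("test_local")])
unittest.TextTestRunner(verbosity=2).run(suite)
```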