| -rw-r--r-- | .gitignore | 1 |
| -rw-r--r-- | README.md | 27 |
| -rwxr-xr-x | arxiv2remarkable.py | 852 |
| -rw-r--r-- | poetry.lock | 60 |
| -rw-r--r-- | pyproject.toml | 1 |
| -rw-r--r-- | test.py | 97 |
6 files changed, 579 insertions, 459 deletions
diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..c18dd8d --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +__pycache__/ @@ -57,16 +57,18 @@ And here's an example with verbose mode enabled that shows everything the script does: ```bash $ python arxiv2remarkable.py -v https://arxiv.org/abs/1811.11242 -2019-02-03 18:11:41.816 | INFO | __main__:download_url:106 - Downloading file at url: https://arxiv.org/pdf/1811.11242v1.pdf -2019-02-03 18:11:46.833 | INFO | __main__:get_page_with_retry:92 - Downloading url: https://arxiv.org/pdf/1811.11242v1.pdf -2019-02-03 18:11:46.835 | INFO | __main__:get_paper_info:194 - Getting paper info from arXiv -2019-02-03 18:11:47.496 | INFO | __main__:get_page_with_retry:92 - Downloading url: https://arxiv.org/abs/1811.11242v1 -2019-02-03 18:11:47.508 | INFO | __main__:generate_filename:206 - Generating output filename -2019-02-03 18:11:47.508 | INFO | __main__:dearxiv:114 - Removing arXiv timestamp -2019-02-03 18:11:49.221 | INFO | __main__:crop_pdf:154 - Cropping pdf file -2019-02-03 18:11:53.247 | INFO | __main__:shrink_pdf:172 - Shrinking pdf file -2019-02-03 18:11:54.802 | INFO | __main__:upload_to_rm:218 - Starting upload to reMarkable -2019-02-03 18:11:57.767 | INFO | __main__:upload_to_rm:223 - Upload successful. +2019-05-30 00:38:27 - INFO - Starting ArxivProvider +2019-05-30 00:38:27 - INFO - Getting paper info from arXiv +2019-05-30 00:38:27 - INFO - Downloading url: https://arxiv.org/abs/1811.11242 +2019-05-30 00:38:27 - INFO - Generating output filename +2019-05-30 00:38:27 - INFO - Created filename: Burg_Nazabal_Sutton_-_Wrangling_Messy_CSV_Files_by_Detecting_Row_and_Type_Patterns_2018.pdf +2019-05-30 00:38:27 - INFO - Downloading file at url: https://arxiv.org/pdf/1811.11242.pdf +2019-05-30 00:38:32 - INFO - Downloading url: https://arxiv.org/pdf/1811.11242.pdf +2019-05-30 00:38:32 - INFO - Removing arXiv timestamp +2019-05-30 00:38:34 - INFO - Cropping pdf file +2019-05-30 00:38:37 - INFO - Shrinking pdf file +2019-05-30 00:38:38 - INFO - Starting upload to reMarkable +2019-05-30 00:38:42 - INFO - Upload successful. ``` ## Dependencies @@ -79,21 +81,20 @@ The script requires the following external programs to be available: - [GhostScript](https://www.ghostscript.com/) - [rMAPI](https://github.com/juruen/rmapi) -If these scripts are not available on the PATH variable, you can supply them +If these scripts are not available on the ``PATH`` variable, you can supply them with the relevant options to the script. The script also needs the following Python packages: - [BeautifulSoup4](https://pypi.org/project/beautifulsoup4/): parsing HTML - [requests](https://pypi.org/project/requests/): getting HTML -- [loguru](https://pypi.org/project/loguru/): easy logging - [PyPDF2](https://github.com/mstamy2/PyPDF2): verifying urls point to PDF - [titlecase](https://pypi.org/project/titlecase/): fancy titles You can use this line: ```bash -pip install --user bs4 requests loguru PyPDF2 titlecase +pip install --user bs4 requests PyPDF2 titlecase ``` # Notes diff --git a/arxiv2remarkable.py b/arxiv2remarkable.py index 61a3667..08beaca 100755 --- a/arxiv2remarkable.py +++ b/arxiv2remarkable.py @@ -1,16 +1,11 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -""" -Given an arXiv paper url this script: +__version__ = "0.2.0" +__author__ = "G.J.J. van den Burg" -1. Downloads the paper -2. Strips the timestamp -3. Crops the pdf to remove unnecessary borders -4. Shrinks the pdf to reduce the filesize -5. 
Renames it using the format: - '_'.join(author_lastnames) + '_-_' + title + '_' + year.pdf -6. Uploads it to the reMarkable using rMapi. +""" +Download a paper from various sources and send it to the reMarkable. Author: G.J.J. van den Burg Date: 2019-02-02 @@ -19,8 +14,10 @@ License: MIT """ import PyPDF2 +import abc import argparse import bs4 +import datetime import os import re import requests @@ -32,8 +29,6 @@ import time import titlecase import urllib.parse -from loguru import logger - GITHUB_URL = "https://github.com/GjjvdBurg/arxiv2remarkable" HEADERS = { @@ -43,334 +38,468 @@ HEADERS = { } -def exception(msg): - print("ERROR: " + msg, file=sys.stderr) - print("Error occurred. Exiting.", file=sys.stderr) - raise SystemExit(1) - - -def arxiv_url(url): - """Check if the url is to an arXiv page. - - >>> validate_url("https://arxiv.org/abs/1811.11242") - True - >>> validate_url("https://arxiv.org/pdf/1811.11242.pdf") - True - >>> validate_url("http://arxiv.org/abs/1811.11242") - True - >>> validate_url("http://arxiv.org/pdf/1811.11242.pdf") - True - >>> validate_url("https://arxiv.org/abs/1811.11242v1") - True - >>> validate_url("https://arxiv.org/pdf/1811.11242v1.pdf") - True - >>> validate_url("https://gertjanvandenburg.com") - False - """ - m = re.match( - "https?://arxiv.org/(abs|pdf)/\d{4}\.\d{4,5}(v\d+)?(\.pdf)?", url - ) - return not m is None - +class Provider(metaclass=abc.ABCMeta): + """ ABC for providers of pdf sources """ -def pmc_url(url): - m = re.fullmatch( - "https?://www.ncbi.nlm.nih.gov/pmc/articles/PMC\d+.*", url - ) - return not m is None - - -def acm_url(url): - m = re.fullmatch("https?://dl.acm.org/citation.cfm\?id=\d+", url) - return not m is None - - -def valid_url(url): - try: - result = urllib.parse.urlparse(url) - return all([result.scheme, result.netloc, result.path]) - except: - return False + def __init__( + self, + verbose=False, + upload=True, + debug=False, + remarkable_dir="/", + rmapi_path="rmapi", + pdfcrop_path="pdfcrop", + pdftk_path="pdftk", + gs_path="gs", + ): + self.verbose = verbose + self.upload = upload + self.debug = debug + self.remarkable_dir = remarkable_dir + self.rmapi_path = rmapi_path + self.pdfcrop_path = pdfcrop_path + self.pdftk_path = pdftk_path + self.gs_path = gs_path + + self.log("Starting %s" % type(self).__name__) + + def log(self, msg, mode="info"): + if not self.verbose: + return + if not mode in ["info", "warning"]: + raise ValueError("unknown logging mode.") + now = datetime.datetime.now() + print( + now.strftime("%Y-%m-%d %H:%M:%S") + + " - " + + mode.upper() + + " - " + + msg + ) + def warn(self, msg): + self.log(msg, mode="warning") + + @staticmethod + @abc.abstractmethod + def validate(src): + """ Validate whether ``src`` is appropriate for this provider """ + + @abc.abstractmethod + def retrieve_pdf(self, src, filename): + """ Download pdf from src and save to filename """ + + @abc.abstractmethod + def get_paper_info(self, src): + """ Retrieve the title/author (surnames)/year information """ + + def create_filename(self, info, filename=None): + """ Generate filename using the info dict or filename if provided """ + if not filename is None: + return filename + # we assume that the list of authors is surname only. 
+ self.log("Generating output filename") + if len(info["authors"]) > 3: + author_part = info["authors"][0] + "_et_al" + else: + author_part = "_".join(info["authors"]) + author_part = author_part.replace(" ", "_") + title = info["title"].replace(",", "").replace(":", "") + title_part = titlecase.titlecase(title).replace(" ", "_") + year_part = info["date"].split("/")[0] + name = author_part + "_-_" + title_part + "_" + year_part + ".pdf" + self.log("Created filename: %s" % name) + return name + + def crop_pdf(self, filepath): + self.log("Cropping pdf file") + status = subprocess.call( + [self.pdfcrop_path, "--margins", "15 40 15 15", filepath], + stdout=subprocess.DEVNULL, + ) + if not status == 0: + self.warn("Failed to crop the pdf file at: %s" % filepath) + return filepath + cropped_file = os.path.splitext(filepath)[0] + "-crop.pdf" + if not os.path.exists(cropped_file): + self.warn( + "Can't find cropped file '%s' where expected." % cropped_file + ) + return filepath + return cropped_file -def check_file_is_pdf(filename): - try: - PyPDF2.PdfFileReader(open(filename, "rb")) - return True - except PyPDF2.utils.PdfReadError: - return False + def shrink_pdf(self, filepath): + self.log("Shrinking pdf file") + output_file = os.path.splitext(filepath)[0] + "-shrink.pdf" + status = subprocess.call( + [ + self.gs_path, + "-sDEVICE=pdfwrite", + "-dCompatibilityLevel=1.4", + "-dPDFSETTINGS=/printer", + "-dNOPAUSE", + "-dBATCH", + "-dQUIET", + "-sOutputFile=%s" % output_file, + filepath, + ] + ) + if not status == 0: + self.warn("Failed to shrink the pdf file") + return filepath + return output_file + def check_file_is_pdf(self, filename): + try: + fp = open(filename, "rb") + pdf = PyPDF2.PdfFileReader(fp, strict=False) + fp.close() + del pdf + return True + except PyPDF2.utils.PdfReadError: + exception("Downloaded file isn't a valid pdf file.") + + def download_url(self, url, filename): + """Download the content of an url and save it to a filename """ + self.log("Downloading file at url: %s" % url) + content = self.get_page_with_retry(url) + with open(filename, "wb") as fid: + fid.write(content) + + def get_page_with_retry(self, url, tries=5): + count = 0 + while count < tries: + count += 1 + error = False + try: + res = requests.get(url, headers=HEADERS) + except requests.exceptions.ConnectionError: + error = True + if error or not res.ok: + time.sleep(5) + self.warn("Error getting url %s. 
Retrying in 5 seconds" % url) + continue + self.log("Downloading url: %s" % url) + return res.content -def get_arxiv_urls(url): - """Get the pdf and abs url from any given arXiv url """ - if re.match("https?://arxiv.org/abs/\d{4}\.\d{4,5}(v\d+)?", url): - abs_url = url - pdf_url = url.replace("abs", "pdf") + ".pdf" - elif re.match("https?://arxiv.org/pdf/\d{4}\.\d{4,5}(v\d+)?\.pdf", url): - abs_url = url[:-4].replace("pdf", "abs") - pdf_url = url - else: - exception("Couldn't figure out arXiv urls.") - return pdf_url, abs_url + def upload_to_rm(self, filepath): + remarkable_dir = self.remarkable_dir.rstrip("/") + self.log("Starting upload to reMarkable") + if remarkable_dir: + status = subprocess.call( + [self.rmapi_path, "mkdir", remarkable_dir], + stdout=subprocess.DEVNULL, + ) + if not status == 0: + exception( + "Creating directory %s on reMarkable failed" + % remarkable_dir + ) + status = subprocess.call( + [self.rmapi_path, "put", filepath, remarkable_dir + "/"], + stdout=subprocess.DEVNULL, + ) + if not status == 0: + exception("Uploading file %s to reMarkable failed" % filepath) + self.log("Upload successful.") + def dearxiv(self, input_file): + """Remove the arXiv timestamp from a pdf""" + self.log("Removing arXiv timestamp") + basename = os.path.splitext(input_file)[0] + uncompress_file = basename + "_uncompress.pdf" -def get_pmc_urls(url): - """Get the pdf and html url from a given PMC url """ - if re.match( - "https?://www.ncbi.nlm.nih.gov/pmc/articles/PMC\d+/pdf/nihms\d+\.pdf", - url, - ): - idx = url.index("pdf") - abs_url = url[: idx - 1] - pdf_url = url - elif re.match("https?://www.ncbi.nlm.nih.gov/pmc/articles/PMC\d+/?", url): - abs_url = url - pdf_url = url.rstrip("/") + "/pdf" # it redirects, usually - else: - exception("Couldn't figure out PMC urls.") - return pdf_url, abs_url - - -def get_acm_pdf_url(url): - page = get_page_with_retry(url) - soup = bs4.BeautifulSoup(page, "html.parser") - thea = None - for a in soup.find_all("a"): - if a.get("name") == "FullTextPDF": - thea = a - break - if thea is None: - return None - href = thea.get("href") - if href.startswith("http"): - return href - else: - return "https://dl.acm.org/" + href - - -def get_acm_urls(url): - if re.match("https?://dl.acm.org/citation.cfm\?id=\d+", url): - abs_url = url - pdf_url = get_acm_pdf_url(url) - if pdf_url is None: - exception("Couldn't extract PDF url from ACM citation page.") - else: - exception( - "Couldn't figure out ACM urls, please provide a URL of the " - "format: http(s)://dl.acm.org/citation.cfm?id=..." + status = subprocess.call( + [ + self.pdftk_path, + input_file, + "output", + uncompress_file, + "uncompress", + ] ) - return pdf_url, abs_url + if not status == 0: + exception("pdftk failed to uncompress the pdf.") + + with open(uncompress_file, "rb") as fid: + data = fid.read() + # Remove the text element + data = re.sub( + b"\(arXiv:\d{4}\.\d{4,5}v\d\s+\[\w+\.\w+\]\s+\d{1,2}\s\w{3}\s\d{4}\)Tj", + b"()Tj", + data, + ) + # Remove the URL element + data = re.sub( + b"<<\\n\/URI \(http://arxiv\.org/abs/\d{4}\.\d{4,5}v\d\)\\n\/S /URI\\n>>\\n", + b"", + data, + ) + removed_file = basename + "_removed.pdf" + with open(removed_file, "wb") as oid: + oid.write(data) -def get_page_with_retry(url): - """Get the content of an url, retrying up to five times on failure. 
""" + output_file = basename + "_dearxiv.pdf" + status = subprocess.call( + [self.pdftk_path, removed_file, "output", output_file, "compress"] + ) + if not status == 0: + exception("pdftk failed to compress the pdf.") + + return output_file + + def run(self, src, filename=None): + info = self.get_paper_info(src) + clean_filename = self.create_filename(info, filename) + tmp_filename = "paper.pdf" + + self.initial_dir = os.getcwd() + with tempfile.TemporaryDirectory() as working_dir: + os.chdir(working_dir) + self.retrieve_pdf(src, tmp_filename) + self.check_file_is_pdf(tmp_filename) + + ops = [self.dearxiv, self.crop_pdf, self.shrink_pdf] + intermediate_fname = tmp_filename + for op in ops: + intermediate_fname = op(intermediate_fname) + shutil.move(intermediate_fname, clean_filename) + + if self.debug: + print("Paused in debug mode in dir: %s" % working_dir) + print("Press enter to exit.") + return input() + + if self.upload: + return self.upload_to_rm(clean_filename) + + target_path = os.path.join(self.initial_dir, clean_filename) + while os.path.exists(target_path): + base = os.path.splitext(target_path)[0] + target_path = base + "_.pdf" + shutil.move(clean_filename, target_path) + return target_path + + +class ArxivProvider(Provider): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + def get_abs_pdf_urls(self, url): + """Get the pdf and abs url from any given arXiv url """ + if re.match("https?://arxiv.org/abs/\d{4}\.\d{4,5}(v\d+)?", url): + abs_url = url + pdf_url = url.replace("abs", "pdf") + ".pdf" + elif re.match( + "https?://arxiv.org/pdf/\d{4}\.\d{4,5}(v\d+)?\.pdf", url + ): + abs_url = url[:-4].replace("pdf", "abs") + pdf_url = url + else: + exception("Couldn't figure out arXiv urls.") + return abs_url, pdf_url - def retry(url, count): - if count < 5: - logger.info( - "Caught error for url %s. Retrying in 5 seconds." % url - ) - time.sleep(5) + def validate(src): + """Check if the url is to an arXiv page. 
""" + m = re.match( + "https?://arxiv.org/(abs|pdf)/\d{4}\.\d{4,5}(v\d+)?(\.pdf)?", src + ) + return not m is None + + def retrieve_pdf(self, src, filename): + """ Download the file and save as filename """ + _, pdf_url = self.get_abs_pdf_urls(src) + self.download_url(pdf_url, filename) + + def get_paper_info(self, src): + """ Extract the paper's authors, title, and publication year """ + abs_url, _ = self.get_abs_pdf_urls(src) + self.log("Getting paper info from arXiv") + page = self.get_page_with_retry(abs_url) + soup = bs4.BeautifulSoup(page, "html.parser") + authors = [ + x["content"] + for x in soup.find_all("meta", {"name": "citation_author"}) + ] + authors = [x.split(",")[0].strip() for x in authors] + title = soup.find_all("meta", {"name": "citation_title"})[0]["content"] + date = soup.find_all("meta", {"name": "citation_date"})[0]["content"] + return dict(title=title, date=date, authors=authors) + + +class PMCProvider(Provider): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + def get_abs_pdf_urls(self, url): + """Get the pdf and html url from a given PMC url """ + if re.match( + "https?://www.ncbi.nlm.nih.gov/pmc/articles/PMC\d+/pdf/nihms\d+\.pdf", + url, + ): + idx = url.index("pdf") + abs_url = url[: idx - 1] + pdf_url = url + elif re.match( + "https?://www.ncbi.nlm.nih.gov/pmc/articles/PMC\d+/?", url + ): + abs_url = url + pdf_url = url.rstrip("/") + "/pdf" # it redirects, usually else: - exception("Failed to download url: %s" % url) + exception("Couldn't figure out PMC urls.") + return abs_url, pdf_url - count = 0 - while True: - count += 1 - try: - res = requests.get(url, headers=HEADERS) - except requests.exceptions.ConnectionError: - retry(url, count) - continue - if res.ok: - logger.info("Downloading url: %s" % url) - return res.content + def validate(src): + m = re.fullmatch( + "https?://www.ncbi.nlm.nih.gov/pmc/articles/PMC\d+.*", src + ) + return not m is None + + def retrieve_pdf(self, src, filename): + _, pdf_url = self.get_abs_pdf_urls(src) + self.download_url(pdf_url, filename) + + def get_paper_info(self, src): + """ Extract the paper's authors, title, and publication year """ + self.log("Getting paper info from PMC") + page = self.get_page_with_retry(src) + soup = bs4.BeautifulSoup(page, "html.parser") + authors = [ + x["content"] + for x in soup.find_all("meta", {"name": "citation_authors"}) + ] + # We only use last names, and this method is a guess at best. I'm open to + # more advanced approaches. 
+ authors = [ + x.strip().split(" ")[-1].strip() for x in authors[0].split(",") + ] + title = soup.find_all("meta", {"name": "citation_title"})[0]["content"] + date = soup.find_all("meta", {"name": "citation_date"})[0]["content"] + if re.match("\w+\ \d{4}", date): + date = date.split(" ")[-1] + else: + date = date.replace(" ", "_") + return dict(title=title, date=date, authors=authors) + + +class ACMProvider(Provider): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + def get_acm_pdf_url(self, url): + page = self.get_page_with_retry(url) + soup = bs4.BeautifulSoup(page, "html.parser") + thea = None + for a in soup.find_all("a"): + if a.get("name") == "FullTextPDF": + thea = a + break + if thea is None: + return None + href = thea.get("href") + if href.startswith("http"): + return href + else: + return "https://dl.acm.org/" + href + + def get_abs_pdf_urls(self, url): + if re.match("https?://dl.acm.org/citation.cfm\?id=\d+", url): + abs_url = url + pdf_url = self.get_acm_pdf_url(url) + if pdf_url is None: + exception( + "Couldn't extract PDF url from ACM citation page. Maybe it's behind a paywall?" + ) else: - retry(url, count) + exception( + "Couldn't figure out ACM urls, please provide a URL of the " + "format: http(s)://dl.acm.org/citation.cfm?id=..." + ) + return abs_url, pdf_url + + def retrieve_pdf(self, src, filename): + _, pdf_url = self.get_abs_pdf_urls(src) + self.download_url(pdf_url, filename) + + def validate(src): + m = re.fullmatch("https?://dl.acm.org/citation.cfm\?id=\d+", src) + return not m is None + + def get_paper_info(self, src): + """ Extract the paper's authors, title, and publication year """ + self.log("Getting paper info from ACM") + page = self.get_page_with_retry(src) + soup = bs4.BeautifulSoup(page, "html.parser") + authors = [ + x["content"] + for x in soup.find_all("meta", {"name": "citation_authors"}) + ] + # We only use last names, and this method is a guess. I'm open to more + # advanced approaches. 
+ authors = [ + x.strip().split(",")[0].strip() for x in authors[0].split(";") + ] + title = soup.find_all("meta", {"name": "citation_title"})[0]["content"] + date = soup.find_all("meta", {"name": "citation_date"})[0]["content"] + if not re.match("\d{2}/\d{2}/\d{4}", date.strip()): + self.warn( + "Couldn't extract year from ACM page, please raise an " + "issue on GitHub so I can fix it: %s" % GITHUB_URL + ) + date = date.strip().split("/")[-1] + return dict(title=title, date=date, authors=authors) -def download_url(url, filename): - """Download the content of an url and save it to a filename """ - logger.info("Downloading file at url: %s" % url) - content = get_page_with_retry(url) - with open(filename, "wb") as fid: - fid.write(content) +class LocalFileProvider(Provider): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + def validate(src): + return os.path.exists(src) -def dearxiv(input_file, pdftk_path="pdftk"): - """Remove the arXiv timestamp from a pdf""" - logger.info("Removing arXiv timestamp") - basename = os.path.splitext(input_file)[0] - uncompress_file = basename + "_uncompress.pdf" + def retrieve_pdf(self, src, filename): + source = os.path.join(self.initial_dir, src) + shutil.copy(source, filename) - status = subprocess.call( - [pdftk_path, input_file, "output", uncompress_file, "uncompress"] - ) - if not status == 0: - exception("pdftk failed to uncompress the pdf.") - - with open(uncompress_file, "rb") as fid: - data = fid.read() - # Remove the text element - data = re.sub( - b"\(arXiv:\d{4}\.\d{4,5}v\d\s+\[\w+\.\w+\]\s+\d{1,2}\s\w{3}\s\d{4}\)Tj", - b"()Tj", - data, - ) - # Remove the URL element - data = re.sub( - b"<<\\n\/URI \(http://arxiv\.org/abs/\d{4}\.\d{4,5}v\d\)\\n\/S /URI\\n>>\\n", - b"", - data, - ) + def get_paper_info(self, src): + return {"filename": src} - removed_file = basename + "_removed.pdf" - with open(removed_file, "wb") as oid: - oid.write(data) + def create_filename(self, info, filename=None): + if not filename is None: + return filename + return os.path.basename(info["filename"]) - output_file = basename + "_dearxiv.pdf" - status = subprocess.call( - [pdftk_path, removed_file, "output", output_file, "compress"] - ) - if not status == 0: - exception("pdftk failed to compress the pdf.") - return output_file +class PdfUrlProvider(Provider): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + def validate(src): + try: + result = urllib.parse.urlparse(src) + return all([result.scheme, result.netloc, result.path]) + except: + return False -def crop_pdf(filepath, pdfcrop_path="pdfcrop"): - logger.info("Cropping pdf file") - status = subprocess.call( - [pdfcrop_path, "--margins", "15 40 15 15", filepath], - stdout=subprocess.DEVNULL, - ) - if not status == 0: - logger.warning("Failed to crop the pdf file at: %s" % filepath) - return filepath - cropped_file = os.path.splitext(filepath)[0] + "-crop.pdf" - if not os.path.exists(cropped_file): - logger.warning( - "Can't find cropped file '%s' where expected." 
% cropped_file - ) - return filepath - return cropped_file - - -def shrink_pdf(filepath, gs_path="gs"): - logger.info("Shrinking pdf file") - output_file = os.path.splitext(filepath)[0] + "-shrink.pdf" - status = subprocess.call( - [ - "gs", - "-sDEVICE=pdfwrite", - "-dCompatibilityLevel=1.4", - "-dPDFSETTINGS=/printer", - "-dNOPAUSE", - "-dBATCH", - "-dQUIET", - "-sOutputFile=%s" % output_file, - filepath, - ] - ) - if not status == 0: - logger.warning("Failed to shrink the pdf file") - return filepath - return output_file - - -def get_paper_info_arxiv(url): - """ Extract the paper's authors, title, and publication year """ - logger.info("Getting paper info from arXiv") - page = get_page_with_retry(url) - soup = bs4.BeautifulSoup(page, "html.parser") - authors = [ - x["content"] - for x in soup.find_all("meta", {"name": "citation_author"}) - ] - authors = [x.split(",")[0].strip() for x in authors] - title = soup.find_all("meta", {"name": "citation_title"})[0]["content"] - date = soup.find_all("meta", {"name": "citation_date"})[0]["content"] - return dict(title=title, date=date, authors=authors) - - -def get_paper_info_pmc(url): - """ Extract the paper's authors, title, and publication year """ - logger.info("Getting paper info from PMC") - page = get_page_with_retry(url) - soup = bs4.BeautifulSoup(page, "html.parser") - authors = [ - x["content"] - for x in soup.find_all("meta", {"name": "citation_authors"}) - ] - # We only use last names, and this method is a guess at best. I'm open to - # more advanced approaches. - authors = [x.strip().split(" ")[-1].strip() for x in authors[0].split(",")] - title = soup.find_all("meta", {"name": "citation_title"})[0]["content"] - date = soup.find_all("meta", {"name": "citation_date"})[0]["content"] - if re.match("\w+\ \d{4}", date): - date = date.split(" ")[-1] - else: - date = date.replace(" ", "_") - return dict(title=title, date=date, authors=authors) - - -def get_paper_info_acm(url): - """ Extract the paper's authors, title, and publication year """ - logger.info("Getting paper info from ACM") - page = get_page_with_retry(url) - soup = bs4.BeautifulSoup(page, "html.parser") - authors = [ - x["content"] - for x in soup.find_all("meta", {"name": "citation_authors"}) - ] - # We only use last names, and this method is a guess. I'm open to more - # advanced approaches. - authors = [x.strip().split(",")[0].strip() for x in authors[0].split(";")] - title = soup.find_all("meta", {"name": "citation_title"})[0]["content"] - date = soup.find_all("meta", {"name": "citation_date"})[0]["content"] - if not re.match("\d{2}/\d{2}/\d{4}", date.strip()): - logger.warning( - "Couldn't extract year from ACM page, please raise an " - "issue on GitHub so I can fix it: %s", - GITHUB_URL, - ) - date = date.strip().split("/")[-1] - return dict(title=title, date=date, authors=authors) - - -def generate_filename(info): - """ Generate a nice filename for a paper given the info dict """ - # we assume that the list of authors is lastname only. 
- logger.info("Generating output filename") - if len(info["authors"]) > 3: - author_part = info["authors"][0] + "_et_al" - else: - author_part = "_".join(info["authors"]) - author_part = author_part.replace(" ", "_") - title = info["title"].replace(",", "").replace(":", "").replace(" ", "_") - title_part = titlecase.titlecase(title) - year_part = info["date"].split("/")[0] - return author_part + "_-_" + title_part + "_" + year_part + ".pdf" - - -def upload_to_rm(filepath, remarkable_dir="/", rmapi_path="rmapi"): - remarkable_dir = remarkable_dir.rstrip("/") - logger.info("Starting upload to reMarkable") - if remarkable_dir: - status = subprocess.call( - [rmapi_path, "mkdir", remarkable_dir], stdout=subprocess.DEVNULL - ) - if not status == 0: + def retrieve_pdf(self, url, filename): + self.download_url(url, filename) + + def get_paper_info(self, src): + return None + + def create_filename(self, info, filename=None): + if filename is None: exception( - "Creating directory %s on reMarkable failed" % remarkable_dir + "Filename must be provided with PDFUrlProvider (use --filename)" ) - status = subprocess.call( - [rmapi_path, "put", filepath, remarkable_dir + "/"], - stdout=subprocess.DEVNULL, - ) - if not status == 0: - exception("Uploading file %s to reMarkable failed" % filepath) - logger.info("Upload successful.") + return filename + + +def exception(msg): + print("ERROR: " + msg, file=sys.stderr) + print("Error occurred. Exiting.", file=sys.stderr) + raise SystemExit(1) def parse_args(): @@ -420,82 +549,33 @@ def parse_args(): return parser.parse_args() -@logger.catch def main(): args = parse_args() - if os.path.exists(args.input): - mode = "local_file" - elif arxiv_url(args.input): - mode = "arxiv_url" - elif pmc_url(args.input): - mode = "pmc_url" - elif acm_url(args.input): - mode = "acm_url" - elif valid_url(args.input): - if args.filename is None: - exception( - "Filename must be provided with pdf url (use --filename)" - ) - mode = "pdf_url" - else: - exception("Input not a valid url, arxiv url, or existing file.") - - if not args.verbose: - logger.remove(0) - - start_wd = os.getcwd() - - with tempfile.TemporaryDirectory() as working_dir: - if mode == "local_file": - shutil.copy(args.input, working_dir) - filename = os.path.basename(args.input) - clean_filename = args.filename if args.filename else filename - - os.chdir(working_dir) - if mode in ["arxiv_url", "pmc_url", "acm_url", "pdf_url"]: - filename = "paper.pdf" - if mode == "arxiv_url": - pdf_url, abs_url = get_arxiv_urls(args.input) - paper_info = get_paper_info_arxiv(abs_url) - elif mode == "pmc_url": - pdf_url, abs_url = get_pmc_urls(args.input) - paper_info = get_paper_info_pmc(abs_url) - elif mode == "acm_url": - pdf_url, abs_url = get_acm_urls(args.input) - paper_info = get_paper_info_acm(abs_url) - else: - pdf_url = args.input - download_url(pdf_url, filename) - if not check_file_is_pdf(filename): - exception("Downloaded file isn't a valid pdf file.") - if args.filename: - clean_filename = args.filename - else: - clean_filename = generate_filename(paper_info) - - dearxived = dearxiv(filename, pdftk_path=args.pdftk) - cropped = crop_pdf(dearxived, pdfcrop_path=args.pdfcrop) - shrinked = shrink_pdf(cropped) - shutil.move(shrinked, clean_filename) - - if args.debug: - print("Paused in debug mode in dir: %s" % working_dir) - print("Press enter to exit.") - return input() - - if args.no_upload: - if os.path.exists(os.path.join(start_wd, clean_filename)): - tmpfname = os.path.splitext(filename)[0] + "_cropped.pdf" - 
shutil.move(clean_filename, os.path.join(start_wd, tmpfname)) - else: - shutil.move(clean_filename, start_wd) - else: - upload_to_rm( - clean_filename, - remarkable_dir=args.remarkable_dir, - rmapi_path=args.rmapi, - ) + providers = [ + ArxivProvider, + PMCProvider, + ACMProvider, + LocalFileProvider, + PdfUrlProvider, + ] + + provider = next((p for p in providers if p.validate(args.input)), None) + if provider is None: + exception("Input not valid, no provider can handle this source.") + + prov = provider( + args.verbose, + not args.no_upload, + args.debug, + args.remarkable_dir, + args.rmapi, + args.pdfcrop, + args.pdftk, + args.gs, + ) + + prov.run(args.input, filename=args.filename) if __name__ == "__main__": diff --git a/poetry.lock b/poetry.lock index d8a1205..893007f 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,16 +1,5 @@ [[package]] category = "main" -description = "Produce colored terminal text with an xml-like markup" -name = "ansimarkup" -optional = false -python-versions = "*" -version = "1.4.0" - -[package.dependencies] -colorama = "*" - -[[package]] -category = "main" description = "Screen-scraping library" name = "beautifulsoup4" optional = false @@ -22,19 +11,6 @@ soupsieve = ">=1.2" [[package]] category = "main" -description = "Pretty and helpful exceptions, automatically" -name = "better-exceptions-fork" -optional = false -python-versions = "*" -version = "0.2.1.post6" - -[package.dependencies] -ansimarkup = ">=1.3.0" -colorama = "*" -pygments = ">=2.2.0" - -[[package]] -category = "main" description = "Dummy package for Beautiful Soup" name = "bs4" optional = false @@ -62,14 +38,6 @@ version = "3.0.4" [[package]] category = "main" -description = "Cross-platform colored terminal text." -name = "colorama" -optional = false -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" -version = "0.4.1" - -[[package]] -category = "main" description = "Internationalized Domain Names in Applications (IDNA)" name = "idna" optional = false @@ -78,27 +46,6 @@ version = "2.8" [[package]] category = "main" -description = "Python logging made (stupidly) simple" -name = "loguru" -optional = false -python-versions = ">=3.5" -version = "0.2.5" - -[package.dependencies] -ansimarkup = ">=1.4.0" -better-exceptions-fork = ">=0.2.1.post6" -colorama = ">=0.3.4" - -[[package]] -category = "main" -description = "Pygments is a syntax highlighting package written in Python." -name = "pygments" -optional = false -python-versions = "*" -version = "2.3.1" - -[[package]] -category = "main" description = "Python HTTP for Humans." 
name = "requests" optional = false @@ -128,20 +75,15 @@ python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, <4" version = "1.24.1" [metadata] -content-hash = "b92b4b1d2c4f9d3181044c1ad99fd9bfa49e8618c6ff5de7bd64c557bcc27e39" +content-hash = "bf39364b4d9cc98c89d858338ce8e7609e35fdd4a7e5cfe256f768c12ed7cb82" python-versions = "^3.5" [metadata.hashes] -ansimarkup = ["06365e3ef89a12734fc408b2449cb4642d5fe2e603e95e7296eff9e98a0fe0b4", "174d920481416cec8d5a707af542d6fba25a1df1c21d8996479c32ba453649a4"] beautifulsoup4 = ["034740f6cb549b4e932ae1ab975581e6103ac8f942200a0e9759065984391858", "945065979fb8529dd2f37dbb58f00b661bdbcbebf954f93b32fdf5263ef35348", "ba6d5c59906a85ac23dadfe5c88deaf3e179ef565f4898671253e50a78680718"] -better-exceptions-fork = ["5f0983da51e956dbdaf8b9a3d10e2774b382ce6c6ff2e54685c33e2dbe8f1472"] bs4 = ["36ecea1fd7cc5c0c6e4a1ff075df26d50da647b75376626cc186e2212886dd3a"] certifi = ["47f9c83ef4c0c621eaef743f133f09fa8a74a9b75f037e8624f83bd1b6626cb7", "993f830721089fef441cdfeb4b2c8c9df86f0c63239f06bd025a76a7daddb033"] chardet = ["84ab92ed1c4d4f16916e05906b6b75a6c0fb5db821cc65e70cbd64a3e2a5eaae", "fc323ffcaeaed0e0a02bf4d117757b98aed530d9ed4531e3e15460124c106691"] -colorama = ["05eed71e2e327246ad6b38c540c4a3117230b19679b875190486ddd2d721422d", "f8ac84de7840f5b9c4e3347b3c1eaa50f7e49c2b07596221daec5edaabbd7c48"] idna = ["c357b3f628cf53ae2c4c05627ecc484553142ca23264e593d327bcde5e9c3407", "ea8b7f6188e6fa117537c3df7da9fc686d485087abf6ac197f9c46432f7e4a3c"] -loguru = ["68297d9f23064c2f4764bb5d0c5c767f3ed7f9fc1218244841878f5fc7c94add", "ebac59630946721fd6207264679b267a8bdc290b086226067d6aad86830e3123"] -pygments = ["5ffada19f6203563680669ee7f53b64dabbeb100eb51b61996085e99c03b284a", "e8218dd399a61674745138520d0d4cf2621d7e032439341bc3f647bff125818d"] requests = ["502a824f31acdacb3a35b6690b5fbf0bc41d63a24a45c4004352b0242707598e", "7bf2a778576d825600030a110f3c0e3e8edc51dfaafe1c146e39a2027784957b"] soupsieve = ["466910df7561796a60748826781ebe9a888f7a1668a636ae86783f44d10aae73", "87db12ae79194f0ff9808d2b1641c4f031ae39ffa3cab6b907ea7c1e5e5ed445"] urllib3 = ["61bf29cada3fc2fbefad4fdf059ea4bd1b4a86d2b6d15e1c7c0b582b9752fe39", "de9529817c93f27c8ccbfead6985011db27bd0ddfcdb2d86f3f663385c6a9c22"] diff --git a/pyproject.toml b/pyproject.toml index 6f67ecd..2c28224 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -9,7 +9,6 @@ license = "MIT" python = "^3.5" bs4 = "^0.0.1" requests = "^2.21" -loguru = "^0.2.5" [tool.poetry.dev-dependencies] @@ -0,0 +1,97 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +__author__ = "G.J.J. 
van den Burg" + +"""Tests""" + +import unittest +import tempfile +import hashlib +import shutil +import os + +from arxiv2remarkable import ( + ArxivProvider, + PMCProvider, + ACMProvider, + LocalFileProvider, + PdfUrlProvider, +) + + +def md5sum(filename): + blocksize = 65536 + hasher = hashlib.md5() + with open(filename, "rb") as fid: + buf = fid.read(blocksize) + while len(buf) > 0: + hasher.update(buf) + buf = fid.read(blocksize) + return hasher.hexdigest() + + +class Tests(unittest.TestCase): + @classmethod + def setUpClass(cls): + cls.original_dir = os.getcwd() + + def setUp(self): + self.test_dir = tempfile.mkdtemp() + os.chdir(self.test_dir) + + def tearDown(self): + os.chdir(self.original_dir) + shutil.rmtree(self.test_dir) + + def test_arxiv(self): + prov = ArxivProvider(upload=False) + url = "https://arxiv.org/abs/1811.11242v1" + exp_filename = "Burg_Nazabal_Sutton_-_Wrangling_Messy_CSV_Files_by_Detecting_Row_and_Type_Patterns_2018.pdf" + filename = prov.run(url) + self.assertEqual(exp_filename, os.path.basename(filename)) + fsize = os.path.getsize(filename) + self.assertTrue(1054082 < fsize <= 1056082) + + def test_pmc(self): + prov = PMCProvider(upload=False) + url = "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3474301/" + exp_filename = ( + "Hoogenboom_Manske_-_How_to_Write_a_Scientific_Article_2012.pdf" + ) + filename = prov.run(url) + self.assertEqual(exp_filename, os.path.basename(filename)) + fsize = os.path.getsize(filename) + self.assertTrue(376640 < fsize <= 378640) + + def test_acm(self): + prov = ACMProvider(upload=False) + url = "https://dl.acm.org/citation.cfm?id=3300356" + exp_filename = "Muller_et_al_-_How_Data_Science_Workers_Work_With_Data_Discovery_Capture_Curation_Design_Creation_2019.pdf" + filename = prov.run(url) + self.assertEqual(exp_filename, os.path.basename(filename)) + fsize = os.path.getsize(filename) + self.assertTrue(1691444 < fsize <= 1693444) + + def test_local(self): + local_filename = "test.pdf" + with open(local_filename, "w") as fp: + fp.write( + "%PDF-1.1\n%¥±ë\n\n1 0 obj\n << /Type /Catalog\n /Pages 2 0 R\n >>\nendobj\n\n2 0 obj\n << /Type /Pages\n /Kids [3 0 R]\n /Count 1\n /MediaBox [0 0 300 144]\n >>\nendobj\n\n3 0 obj\n << /Type /Page\n /Parent 2 0 R\n /Resources\n << /Font\n << /F1\n << /Type /Font\n /Subtype /Type1\n /BaseFont /Times-Roman\n >>\n >>\n >>\n /Contents 4 0 R\n >>\nendobj\n\n4 0 obj\n << /Length 55 >>\nstream\n BT\n /F1 18 Tf\n 0 0 Td\n (Hello World) Tj\n ET\nendstream\nendobj\n\nxref\n0 5\n0000000000 65535 f \n0000000018 00000 n \n0000000077 00000 n \n0000000178 00000 n \n0000000457 00000 n \ntrailer\n << /Root 1 0 R\n /Size 5\n >>\nstartxref\n565\n%%EOF" + ) + prov = LocalFileProvider(upload=False) + filename = prov.run(local_filename) + self.assertEqual("test_.pdf", os.path.basename(filename)) + fsize = os.path.getsize(filename) + self.assertTrue(5843 < fsize <= 7843) + + def test_pdfurl(self): + prov = PdfUrlProvider(upload=False) + url = "http://www.jmlr.org/papers/volume17/14-526/14-526.pdf" + filename = prov.run(url, filename="test.pdf") + self.assertEqual("test.pdf", os.path.basename(filename)) + fsize = os.path.getsize(filename) + self.assertTrue(1828169 < fsize <= 1830169) + +if __name__ == "__main__": + unittest.main() |
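
As a quick illustration of the provider interface introduced by this diff, the classes can also be driven directly from Python, much as the new test suite does. A minimal sketch, assuming `arxiv2remarkable.py` is importable from the working directory and that `pdftk`, `pdfcrop`, and GhostScript are available on the `PATH`:

```python
# Minimal sketch of using the new Provider classes programmatically
# (assumes arxiv2remarkable.py is importable and the external tools
# pdftk, pdfcrop, and gs are on the PATH).
from arxiv2remarkable import ArxivProvider

# upload=False keeps the processed file locally instead of calling rMAPI;
# verbose=True prints the timestamped log lines shown in the README.
prov = ArxivProvider(verbose=True, upload=False)
path = prov.run("https://arxiv.org/abs/1811.11242")
print("Saved to:", path)
```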