From 92786bc2f898575f80845f9b7111baa3d386734a Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Fri, 5 Apr 2019 18:15:10 +0100 Subject: [WIP] rewrite to use Providers --- arxiv2remarkable.py | 177 +++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 140 insertions(+), 37 deletions(-) diff --git a/arxiv2remarkable.py b/arxiv2remarkable.py index 61a3667..a0b4a94 100755 --- a/arxiv2remarkable.py +++ b/arxiv2remarkable.py @@ -18,6 +18,7 @@ License: MIT """ +import abc import PyPDF2 import argparse import bs4 @@ -43,36 +44,135 @@ HEADERS = { } +class Provider(metaclass=abc.ABCMeta): + """ ABC for providers of pdf sources """ + + def __init__(self): + pass + + @staticmethod + @abc.abstractmethod + def validate(self, src): + """ Validate whether ``src`` is appropriate for this provider """ + + @abc.abstractmethod + def retrieve_pdf(self, src, filename): + """ Download pdf from src and save to filename """ + + @abc.abstractmethod + def get_paper_info(self, src): + """ Retrieve the title/author (surnames)/year information """ + + def create_filename(self, info, filename=None): + """ Generate filename using the info dict or filename if provided """ + if not filename is None: + return filename + # we assume that the list of authors is surname only. + logger.info("Generating output filename") + if len(info["authors"]) > 3: + author_part = info["authors"][0] + "_et_al" + else: + author_part = "_".join(info["authors"]) + author_part = author_part.replace(" ", "_") + title = ( + info["title"].replace(",", "").replace(":", "").replace(" ", "_") + ) + title_part = titlecase.titlecase(title) + year_part = info["date"].split("/")[0] + return author_part + "_-_" + title_part + "_" + year_part + ".pdf" + + def run(self, src, filename=None): + info = get_paper_info(src) + clean_filename = self.create_filename(info, filename) + tmp_filename = "paper.pdf" + self.retrieve_pdf(src, tmp_filename) + self.check_file_is_pdf(tmp_filename) + + ops = [self.dearxiv, self.crop, self.shrink] + intermediate_fname = tmp_filename + for op in ops: + intermediate_fname = op(tmp_filename) + shutil.move(intermediate_fname, clean_filename) + # TODO: here + + + + + + + + + +class ArxivProvider(Provider): + def __init__(self): + super().__init__() + self.abs_url = None + self.pdf_url = None + + def get_abs_pdf_urls(self, url): + """Get the pdf and abs url from any given arXiv url """ + if re.match("https?://arxiv.org/abs/\d{4}\.\d{4,5}(v\d+)?", url): + abs_url = url + pdf_url = url.replace("abs", "pdf") + ".pdf" + elif re.match( + "https?://arxiv.org/pdf/\d{4}\.\d{4,5}(v\d+)?\.pdf", url + ): + abs_url = url[:-4].replace("pdf", "abs") + pdf_url = url + else: + exception("Couldn't figure out arXiv urls.") + return abs_url, pdf_url + + def validate(self, src): + """Check if the url is to an arXiv page. 
+ + >>> validate_url("https://arxiv.org/abs/1811.11242") + True + >>> validate_url("https://arxiv.org/pdf/1811.11242.pdf") + True + >>> validate_url("http://arxiv.org/abs/1811.11242") + True + >>> validate_url("http://arxiv.org/pdf/1811.11242.pdf") + True + >>> validate_url("https://arxiv.org/abs/1811.11242v1") + True + >>> validate_url("https://arxiv.org/pdf/1811.11242v1.pdf") + True + >>> validate_url("https://gertjanvandenburg.com") + False + """ + m = re.match( + "https?://arxiv.org/(abs|pdf)/\d{4}\.\d{4,5}(v\d+)?(\.pdf)?", src + ) + return not m is None + + def retrieve_pdf(self, src, filename): + """ Download the file and save as filename """ + _, pdf_url = self.get_abs_pdf_urls(src) + download_url(pdf_url, filename) + + def get_paper_info(self, src): + """ Extract the paper's authors, title, and publication year """ + abs_url, _ = self.get_abs_pdf_urls(src) + logger.info("Getting paper info from arXiv") + page = get_page_with_retry(abs_url) + soup = bs4.BeautifulSoup(page, "html.parser") + authors = [ + x["content"] + for x in soup.find_all("meta", {"name": "citation_author"}) + ] + authors = [x.split(",")[0].strip() for x in authors] + title = soup.find_all("meta", {"name": "citation_title"})[0]["content"] + date = soup.find_all("meta", {"name": "citation_date"})[0]["content"] + return dict(title=title, date=date, authors=authors) + + def exception(msg): print("ERROR: " + msg, file=sys.stderr) print("Error occurred. Exiting.", file=sys.stderr) raise SystemExit(1) -def arxiv_url(url): - """Check if the url is to an arXiv page. - - >>> validate_url("https://arxiv.org/abs/1811.11242") - True - >>> validate_url("https://arxiv.org/pdf/1811.11242.pdf") - True - >>> validate_url("http://arxiv.org/abs/1811.11242") - True - >>> validate_url("http://arxiv.org/pdf/1811.11242.pdf") - True - >>> validate_url("https://arxiv.org/abs/1811.11242v1") - True - >>> validate_url("https://arxiv.org/pdf/1811.11242v1.pdf") - True - >>> validate_url("https://gertjanvandenburg.com") - False - """ - m = re.match( - "https?://arxiv.org/(abs|pdf)/\d{4}\.\d{4,5}(v\d+)?(\.pdf)?", url - ) - return not m is None - - def pmc_url(url): m = re.fullmatch( "https?://www.ncbi.nlm.nih.gov/pmc/articles/PMC\d+.*", url @@ -101,19 +201,6 @@ def check_file_is_pdf(filename): return False -def get_arxiv_urls(url): - """Get the pdf and abs url from any given arXiv url """ - if re.match("https?://arxiv.org/abs/\d{4}\.\d{4,5}(v\d+)?", url): - abs_url = url - pdf_url = url.replace("abs", "pdf") + ".pdf" - elif re.match("https?://arxiv.org/pdf/\d{4}\.\d{4,5}(v\d+)?\.pdf", url): - abs_url = url[:-4].replace("pdf", "abs") - pdf_url = url - else: - exception("Couldn't figure out arXiv urls.") - return pdf_url, abs_url - - def get_pmc_urls(url): """Get the pdf and html url from a given PMC url """ if re.match( @@ -420,6 +507,22 @@ def parse_args(): return parser.parse_args() +@logger.catch +def newmain(): + args = parse_args() + + provider = next((p for p in providers if p.validate(args.input)), None) + if provider is None: + exception("Input not valid, no provider can handle this source.") + + if not args.verbose: + logger.remove(0) + + start_wd = os.getcwd() + with tempfile.TemporaryDirector() as working_dir: + provider.run(args.input) + + @logger.catch def main(): args = parse_args() -- cgit v1.2.3 From 6077055c6b736084f65a6dadad846c9a637a6e56 Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Sun, 7 Apr 2019 21:26:55 +0100 Subject: log filename created --- arxiv2remarkable.py | 4 +++- 1 file changed, 3 insertions(+), 1 
deletion(-) diff --git a/arxiv2remarkable.py b/arxiv2remarkable.py index 2b12919..ccd1621 100755 --- a/arxiv2remarkable.py +++ b/arxiv2remarkable.py @@ -282,7 +282,9 @@ def generate_filename(info): title = info["title"].replace(",", "").replace(":", "").replace(" ", "_") title_part = titlecase.titlecase(title) year_part = info["date"].split("/")[0] - return author_part + "_-_" + title_part + "_" + year_part + ".pdf" + name = author_part + "_-_" + title_part + "_" + year_part + ".pdf" + logger.info("Created filename: %s" % name) + return name def upload_to_rm(filepath, remarkable_dir="/", rmapi_path="rmapi"): -- cgit v1.2.3 From 66a9a4d63068fb32e12f61d1fe7612586d6dc30e Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Tue, 21 May 2019 10:40:38 -0400 Subject: more work in progress --- arxiv2remarkable.py | 479 ++++++++++++++++++++++++++++------------------------ 1 file changed, 254 insertions(+), 225 deletions(-) diff --git a/arxiv2remarkable.py b/arxiv2remarkable.py index 5152018..9686ee1 100755 --- a/arxiv2remarkable.py +++ b/arxiv2remarkable.py @@ -1,6 +1,9 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- +__version__ = "0.2.0" +__author__ = "G.J.J. van den Burg" + """ Given an arXiv paper url this script: @@ -47,8 +50,9 @@ HEADERS = { class Provider(metaclass=abc.ABCMeta): """ ABC for providers of pdf sources """ - def __init__(self): - pass + def __init__(self, remarkable_dir="/", rmapi_path="rmapi"): + self.remarkable_dir = remarkable_dir + self.rmapi_path = rmapi_path @staticmethod @abc.abstractmethod @@ -79,10 +83,78 @@ class Provider(metaclass=abc.ABCMeta): ) title_part = titlecase.titlecase(title) year_part = info["date"].split("/")[0] - return author_part + "_-_" + title_part + "_" + year_part + ".pdf" + name = author_part + "_-_" + title_part + "_" + year_part + ".pdf" + logger.info("Created filename: %s" % name) + return name + + def crop_pdf(self, filepath): + logger.info("Cropping pdf file") + status = subprocess.call( + [self.pdfcrop_path, "--margins", "15 40 15 15", filepath], + stdout=subprocess.DEVNULL, + ) + if not status == 0: + logger.warning("Failed to crop the pdf file at: %s" % filepath) + return filepath + cropped_file = os.path.splitext(filepath)[0] + "-crop.pdf" + if not os.path.exists(cropped_file): + logger.warning( + "Can't find cropped file '%s' where expected." 
% cropped_file + ) + return filepath + return cropped_file + + def shrink_pdf(self, filepath): + logger.info("Shrinking pdf file") + output_file = os.path.splitext(filepath)[0] + "-shrink.pdf" + status = subprocess.call( + [ + self.gs_path, + "-sDEVICE=pdfwrite", + "-dCompatibilityLevel=1.4", + "-dPDFSETTINGS=/printer", + "-dNOPAUSE", + "-dBATCH", + "-dQUIET", + "-sOutputFile=%s" % output_file, + filepath, + ] + ) + if not status == 0: + logger.warning("Failed to shrink the pdf file") + return filepath + return output_file - def run(self, src, filename=None): - info = get_paper_info(src) + def check_file_is_pdf(self, filename): + try: + PyPDF2.PdfFileReader(open(filename, "rb")) + return True + except PyPDF2.utils.PdfReadError: + exception("Downloaded file isn't a valid pdf file.") + + def upload_to_rm(self, filepath): + remarkable_dir = self.remarkable_dir.rstrip("/") + logger.info("Starting upload to reMarkable") + if remarkable_dir: + status = subprocess.call( + [self.rmapi_path, "mkdir", remarkable_dir], + stdout=subprocess.DEVNULL, + ) + if not status == 0: + exception( + "Creating directory %s on reMarkable failed" + % remarkable_dir + ) + status = subprocess.call( + [self.rmapi_path, "put", filepath, remarkable_dir + "/"], + stdout=subprocess.DEVNULL, + ) + if not status == 0: + exception("Uploading file %s to reMarkable failed" % filepath) + logger.info("Upload successful.") + + def run(self, src, filename=None, debug=False, upload=True): + info = self.get_paper_info(src) clean_filename = self.create_filename(info, filename) tmp_filename = "paper.pdf" self.retrieve_pdf(src, tmp_filename) @@ -93,21 +165,24 @@ class Provider(metaclass=abc.ABCMeta): for op in ops: intermediate_fname = op(tmp_filename) shutil.move(intermediate_fname, clean_filename) - # TODO: here - - - - - + if debug: + print("Paused in debug mode in dir: %s" % working_dir) + print("Press enter to exit.") + return input() + if upload: + return self.upload_to_rm(clean_filename) + if os.path.exists(os.path.join(start_wd, clean_filename)): + tmpfname = os.path.splitext(filename)[0] + "_cropped.pdf" + shutil.move(clean_filename, os.path.join(start_wd, tmpfname)) + else: + shutil.move(clean_filename, start_wd) class ArxivProvider(Provider): - def __init__(self): - super().__init__() - self.abs_url = None - self.pdf_url = None + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) def get_abs_pdf_urls(self, url): """Get the pdf and abs url from any given arXiv url """ @@ -124,23 +199,7 @@ class ArxivProvider(Provider): return abs_url, pdf_url def validate(self, src): - """Check if the url is to an arXiv page. - - >>> validate_url("https://arxiv.org/abs/1811.11242") - True - >>> validate_url("https://arxiv.org/pdf/1811.11242.pdf") - True - >>> validate_url("http://arxiv.org/abs/1811.11242") - True - >>> validate_url("http://arxiv.org/pdf/1811.11242.pdf") - True - >>> validate_url("https://arxiv.org/abs/1811.11242v1") - True - >>> validate_url("https://arxiv.org/pdf/1811.11242v1.pdf") - True - >>> validate_url("https://gertjanvandenburg.com") - False - """ + """Check if the url is to an arXiv page. 
""" m = re.match( "https?://arxiv.org/(abs|pdf)/\d{4}\.\d{4,5}(v\d+)?(\.pdf)?", src ) @@ -166,87 +225,165 @@ class ArxivProvider(Provider): date = soup.find_all("meta", {"name": "citation_date"})[0]["content"] return dict(title=title, date=date, authors=authors) +class PMCProvider(Provider): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) -def exception(msg): - print("ERROR: " + msg, file=sys.stderr) - print("Error occurred. Exiting.", file=sys.stderr) - raise SystemExit(1) + def get_abs_pdf_urls(self, url): + """Get the pdf and html url from a given PMC url """ + if re.match( + "https?://www.ncbi.nlm.nih.gov/pmc/articles/PMC\d+/pdf/nihms\d+\.pdf", + url, + ): + idx = url.index("pdf") + abs_url = url[: idx - 1] + pdf_url = url + elif re.match( + "https?://www.ncbi.nlm.nih.gov/pmc/articles/PMC\d+/?", url + ): + abs_url = url + pdf_url = url.rstrip("/") + "/pdf" # it redirects, usually + else: + exception("Couldn't figure out PMC urls.") + return pdf_url, abs_url + def validate(self, src): + m = re.fullmatch( + "https?://www.ncbi.nlm.nih.gov/pmc/articles/PMC\d+.*", src + ) + return not m is None -def pmc_url(url): - m = re.fullmatch( - "https?://www.ncbi.nlm.nih.gov/pmc/articles/PMC\d+.*", url - ) - return not m is None - - -def acm_url(url): - m = re.fullmatch("https?://dl.acm.org/citation.cfm\?id=\d+", url) - return not m is None - - -def valid_url(url): - try: - result = urllib.parse.urlparse(url) - return all([result.scheme, result.netloc, result.path]) - except: - return False - - -def check_file_is_pdf(filename): - try: - PyPDF2.PdfFileReader(open(filename, "rb")) - return True - except PyPDF2.utils.PdfReadError: - return False - - -def get_pmc_urls(url): - """Get the pdf and html url from a given PMC url """ - if re.match( - "https?://www.ncbi.nlm.nih.gov/pmc/articles/PMC\d+/pdf/nihms\d+\.pdf", - url, - ): - idx = url.index("pdf") - abs_url = url[: idx - 1] - pdf_url = url - elif re.match("https?://www.ncbi.nlm.nih.gov/pmc/articles/PMC\d+/?", url): - abs_url = url - pdf_url = url.rstrip("/") + "/pdf" # it redirects, usually - else: - exception("Couldn't figure out PMC urls.") - return pdf_url, abs_url - - -def get_acm_pdf_url(url): - page = get_page_with_retry(url) - soup = bs4.BeautifulSoup(page, "html.parser") - thea = None - for a in soup.find_all("a"): - if a.get("name") == "FullTextPDF": - thea = a - break - if thea is None: + def retrieve_pdf(self, src, filename): + _, pdf_url = self.get_abs_pdf_urls(src) + download_url(pdf_url, filename) + + def get_paper_info(self, src): + """ Extract the paper's authors, title, and publication year """ + logger.info("Getting paper info from PMC") + page = get_page_with_retry(src) + soup = bs4.BeautifulSoup(page, "html.parser") + authors = [ + x["content"] + for x in soup.find_all("meta", {"name": "citation_authors"}) + ] + # We only use last names, and this method is a guess at best. I'm open to + # more advanced approaches. 
+ authors = [ + x.strip().split(" ")[-1].strip() for x in authors[0].split(",") + ] + title = soup.find_all("meta", {"name": "citation_title"})[0]["content"] + date = soup.find_all("meta", {"name": "citation_date"})[0]["content"] + if re.match("\w+\ \d{4}", date): + date = date.split(" ")[-1] + else: + date = date.replace(" ", "_") + return dict(title=title, date=date, authors=authors) + +class ACMProvider(Provider): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + def get_acm_pdf_url(self, url): + page = get_page_with_retry(url) + soup = bs4.BeautifulSoup(page, "html.parser") + thea = None + for a in soup.find_all("a"): + if a.get("name") == "FullTextPDF": + thea = a + break + if thea is None: + return None + href = thea.get("href") + if href.startswith("http"): + return href + else: + return "https://dl.acm.org/" + href + + def get_abs_pdf_urls(self, url): + if re.match("https?://dl.acm.org/citation.cfm\?id=\d+", url): + abs_url = url + pdf_url = self.get_acm_pdf_url(url) + if pdf_url is None: + exception("Couldn't extract PDF url from ACM citation page.") + else: + exception( + "Couldn't figure out ACM urls, please provide a URL of the " + "format: http(s)://dl.acm.org/citation.cfm?id=..." + ) + return pdf_url, abs_url + + def retrieve_pdf(self, src, filename): + _, pdf_url = self.get_abs_pdf_urls(src) + download_url(pdf_url, filename) + + def validate(self, src): + m = re.fullmatch("https?://dl.acm.org/citation.cfm\?id=\d+", src) + return not m is None + + def get_paper_info(self, src): + """ Extract the paper's authors, title, and publication year """ + logger.info("Getting paper info from ACM") + page = get_page_with_retry(src) + soup = bs4.BeautifulSoup(page, "html.parser") + authors = [ + x["content"] + for x in soup.find_all("meta", {"name": "citation_authors"}) + ] + # We only use last names, and this method is a guess. I'm open to more + # advanced approaches. + authors = [ + x.strip().split(",")[0].strip() for x in authors[0].split(";") + ] + title = soup.find_all("meta", {"name": "citation_title"})[0]["content"] + date = soup.find_all("meta", {"name": "citation_date"})[0]["content"] + if not re.match("\d{2}/\d{2}/\d{4}", date.strip()): + logger.warning( + "Couldn't extract year from ACM page, please raise an " + "issue on GitHub so I can fix it: %s", + GITHUB_URL, + ) + date = date.strip().split("/")[-1] + return dict(title=title, date=date, authors=authors) + +class LocalFileProvider(Provider): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + def validate(self, src): + return os.path.exists(src) + + def retrieve_pdf(self, src, filename): + shutil.copy(src, filename) + + def get_paper_info(self, src): return None - href = thea.get("href") - if href.startswith("http"): - return href - else: - return "https://dl.acm.org/" + href +class PdfUrlProvider(Provider): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) -def get_acm_urls(url): - if re.match("https?://dl.acm.org/citation.cfm\?id=\d+", url): - abs_url = url - pdf_url = get_acm_pdf_url(url) - if pdf_url is None: - exception("Couldn't extract PDF url from ACM citation page.") - else: - exception( - "Couldn't figure out ACM urls, please provide a URL of the " - "format: http(s)://dl.acm.org/citation.cfm?id=..." 
- ) - return pdf_url, abs_url + def validate(self, src): + try: + result = urllib.parse.urlparse(src) + return all([result.scheme, result.netloc, result.path]) + except: + return False + + def retrieve_pdf(self, url, filename): + if filename is None: + exception( + "Filename must be provided with pdf url (use --filename)" + ) + download_url(url, filename) + + def get_paper_info(self, src): + return None + + +def exception(msg): + print("ERROR: " + msg, file=sys.stderr) + print("Error occurred. Exiting.", file=sys.stderr) + raise SystemExit(1) def get_page_with_retry(url): @@ -325,123 +462,6 @@ def dearxiv(input_file, pdftk_path="pdftk"): return output_file -def crop_pdf(filepath, pdfcrop_path="pdfcrop"): - logger.info("Cropping pdf file") - status = subprocess.call( - [pdfcrop_path, "--margins", "15 40 15 15", filepath], - stdout=subprocess.DEVNULL, - ) - if not status == 0: - logger.warning("Failed to crop the pdf file at: %s" % filepath) - return filepath - cropped_file = os.path.splitext(filepath)[0] + "-crop.pdf" - if not os.path.exists(cropped_file): - logger.warning( - "Can't find cropped file '%s' where expected." % cropped_file - ) - return filepath - return cropped_file - - -def shrink_pdf(filepath, gs_path="gs"): - logger.info("Shrinking pdf file") - output_file = os.path.splitext(filepath)[0] + "-shrink.pdf" - status = subprocess.call( - [ - "gs", - "-sDEVICE=pdfwrite", - "-dCompatibilityLevel=1.4", - "-dPDFSETTINGS=/printer", - "-dNOPAUSE", - "-dBATCH", - "-dQUIET", - "-sOutputFile=%s" % output_file, - filepath, - ] - ) - if not status == 0: - logger.warning("Failed to shrink the pdf file") - return filepath - return output_file - - -def get_paper_info_arxiv(url): - """ Extract the paper's authors, title, and publication year """ - logger.info("Getting paper info from arXiv") - page = get_page_with_retry(url) - soup = bs4.BeautifulSoup(page, "html.parser") - authors = [ - x["content"] - for x in soup.find_all("meta", {"name": "citation_author"}) - ] - authors = [x.split(",")[0].strip() for x in authors] - title = soup.find_all("meta", {"name": "citation_title"})[0]["content"] - date = soup.find_all("meta", {"name": "citation_date"})[0]["content"] - return dict(title=title, date=date, authors=authors) - - -def get_paper_info_pmc(url): - """ Extract the paper's authors, title, and publication year """ - logger.info("Getting paper info from PMC") - page = get_page_with_retry(url) - soup = bs4.BeautifulSoup(page, "html.parser") - authors = [ - x["content"] - for x in soup.find_all("meta", {"name": "citation_authors"}) - ] - # We only use last names, and this method is a guess at best. I'm open to - # more advanced approaches. - authors = [x.strip().split(" ")[-1].strip() for x in authors[0].split(",")] - title = soup.find_all("meta", {"name": "citation_title"})[0]["content"] - date = soup.find_all("meta", {"name": "citation_date"})[0]["content"] - if re.match("\w+\ \d{4}", date): - date = date.split(" ")[-1] - else: - date = date.replace(" ", "_") - return dict(title=title, date=date, authors=authors) - - -def get_paper_info_acm(url): - """ Extract the paper's authors, title, and publication year """ - logger.info("Getting paper info from ACM") - page = get_page_with_retry(url) - soup = bs4.BeautifulSoup(page, "html.parser") - authors = [ - x["content"] - for x in soup.find_all("meta", {"name": "citation_authors"}) - ] - # We only use last names, and this method is a guess. I'm open to more - # advanced approaches. 
- authors = [x.strip().split(",")[0].strip() for x in authors[0].split(";")] - title = soup.find_all("meta", {"name": "citation_title"})[0]["content"] - date = soup.find_all("meta", {"name": "citation_date"})[0]["content"] - if not re.match("\d{2}/\d{2}/\d{4}", date.strip()): - logger.warning( - "Couldn't extract year from ACM page, please raise an " - "issue on GitHub so I can fix it: %s", - GITHUB_URL, - ) - date = date.strip().split("/")[-1] - return dict(title=title, date=date, authors=authors) - - -def generate_filename(info): - """ Generate a nice filename for a paper given the info dict """ - # we assume that the list of authors is lastname only. - logger.info("Generating output filename") - if len(info["authors"]) > 3: - author_part = info["authors"][0] + "_et_al" - else: - author_part = "_".join(info["authors"]) - author_part = author_part.replace(" ", "_") - title = info["title"].replace(",", "").replace(":", "").replace(" ", "_") - title_part = titlecase.titlecase(title) - year_part = info["date"].split("/")[0] - name = author_part + "_-_" + title_part + "_" + year_part + ".pdf" - logger.info("Created filename: %s" % name) - return name - - def upload_to_rm(filepath, remarkable_dir="/", rmapi_path="rmapi"): remarkable_dir = remarkable_dir.rstrip("/") logger.info("Starting upload to reMarkable") @@ -513,6 +533,14 @@ def parse_args(): def newmain(): args = parse_args() + providers = [ + ArxivProvider, + PMCProvider, + ACMProvider, + LocalFileProvider, + PdfUrlProvider, + ] + provider = next((p for p in providers if p.validate(args.input)), None) if provider is None: exception("Input not valid, no provider can handle this source.") @@ -520,9 +548,10 @@ def newmain(): if not args.verbose: logger.remove(0) + start_wd = os.getcwd() with tempfile.TemporaryDirector() as working_dir: - provider.run(args.input) + provider.run(args.input, debug=args.debug, upload=not args.no_upload) @logger.catch -- cgit v1.2.3 From 42fd83b66037bd7c714a67d4f22d38eda478ecb8 Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Wed, 29 May 2019 23:11:02 +0100 Subject: even more work in progress --- arxiv2remarkable.py | 473 ++++++++++++++++++++++++---------------------------- 1 file changed, 220 insertions(+), 253 deletions(-) diff --git a/arxiv2remarkable.py b/arxiv2remarkable.py index 9686ee1..8a8d58b 100755 --- a/arxiv2remarkable.py +++ b/arxiv2remarkable.py @@ -21,10 +21,11 @@ License: MIT """ -import abc import PyPDF2 +import abc import argparse import bs4 +import datetime import os import re import requests @@ -36,8 +37,6 @@ import time import titlecase import urllib.parse -from loguru import logger - GITHUB_URL = "https://github.com/GjjvdBurg/arxiv2remarkable" HEADERS = { @@ -50,13 +49,45 @@ HEADERS = { class Provider(metaclass=abc.ABCMeta): """ ABC for providers of pdf sources """ - def __init__(self, remarkable_dir="/", rmapi_path="rmapi"): + def __init__( + self, + verbose=False, + upload=True, + debug=False, + remarkable_dir="/", + rmapi_path="rmapi", + pdfcrop_path="pdfcrop", + pdftk_path="pdftk", + gs_path="gs", + ): + self.verbose = verbose + self.upload = upload + self.debug = debug self.remarkable_dir = remarkable_dir self.rmapi_path = rmapi_path + self.pdfcrop_path = pdfcrop_path + self.pdftk_path = pdftk_path + self.gs_path = gs_path + + self.log("Starting %s" % type(self).__name__) + + def log(self, msg, mode="info"): + if not self.verbose: + return + if not mode in ["info", "warning"]: + raise ValueError("unknown logging mode.") + now = datetime.datetime.now() + print( + 
now.strftime("%Y-%m-%d %H:%M:%S") + + " - " + + mode.upper() + + " - " + + msg + ) @staticmethod @abc.abstractmethod - def validate(self, src): + def validate(src): """ Validate whether ``src`` is appropriate for this provider """ @abc.abstractmethod @@ -72,40 +103,41 @@ class Provider(metaclass=abc.ABCMeta): if not filename is None: return filename # we assume that the list of authors is surname only. - logger.info("Generating output filename") + self.log("Generating output filename") if len(info["authors"]) > 3: author_part = info["authors"][0] + "_et_al" else: author_part = "_".join(info["authors"]) author_part = author_part.replace(" ", "_") - title = ( - info["title"].replace(",", "").replace(":", "").replace(" ", "_") - ) - title_part = titlecase.titlecase(title) + title = info["title"].replace(",", "").replace(":", "") + title_part = titlecase.titlecase(title).replace(" ", "_") year_part = info["date"].split("/")[0] name = author_part + "_-_" + title_part + "_" + year_part + ".pdf" - logger.info("Created filename: %s" % name) + self.log("Created filename: %s" % name) return name def crop_pdf(self, filepath): - logger.info("Cropping pdf file") + self.log("Cropping pdf file") status = subprocess.call( [self.pdfcrop_path, "--margins", "15 40 15 15", filepath], stdout=subprocess.DEVNULL, ) if not status == 0: - logger.warning("Failed to crop the pdf file at: %s" % filepath) + self.log( + "Failed to crop the pdf file at: %s" % filepath, mode="warning" + ) return filepath cropped_file = os.path.splitext(filepath)[0] + "-crop.pdf" if not os.path.exists(cropped_file): - logger.warning( - "Can't find cropped file '%s' where expected." % cropped_file + self.log( + "Can't find cropped file '%s' where expected." % cropped_file, + mode="warning", ) return filepath return cropped_file def shrink_pdf(self, filepath): - logger.info("Shrinking pdf file") + self.log("Shrinking pdf file") output_file = os.path.splitext(filepath)[0] + "-shrink.pdf" status = subprocess.call( [ @@ -121,20 +153,58 @@ class Provider(metaclass=abc.ABCMeta): ] ) if not status == 0: - logger.warning("Failed to shrink the pdf file") + self.log("Failed to shrink the pdf file", mode="warning") return filepath return output_file def check_file_is_pdf(self, filename): try: - PyPDF2.PdfFileReader(open(filename, "rb")) + fp = open(filename, "rb") + pdf = PyPDF2.PdfFileReader(fp, strict=False) + fp.close() + del pdf return True except PyPDF2.utils.PdfReadError: exception("Downloaded file isn't a valid pdf file.") + def download_url(self, url, filename): + """Download the content of an url and save it to a filename """ + self.log("Downloading file at url: %s" % url) + content = self.get_page_with_retry(url) + with open(filename, "wb") as fid: + fid.write(content) + + def get_page_with_retry(self, url, times=5): + """ Get the content of an url, retrying on failure. + """ + + def retry(url, count): + if count < times: + self.log( + "Caught error for url %s. Retrying in 5 seconds." 
% url, + mode="warning", + ) + time.sleep(5) + else: + exception("Failed to download url: %s" % url) + + count = 0 + while True: + count += 1 + try: + res = requests.get(url, headers=HEADERS) + except requests.exceptions.ConnectionError: + retry(url, count) + continue + if res.ok: + self.log("Downloading url: %s" % url) + return res.content + else: + retry(url, count) + def upload_to_rm(self, filepath): remarkable_dir = self.remarkable_dir.rstrip("/") - logger.info("Starting upload to reMarkable") + self.log("Starting upload to reMarkable") if remarkable_dir: status = subprocess.call( [self.rmapi_path, "mkdir", remarkable_dir], @@ -151,34 +221,86 @@ class Provider(metaclass=abc.ABCMeta): ) if not status == 0: exception("Uploading file %s to reMarkable failed" % filepath) - logger.info("Upload successful.") + self.log("Upload successful.") - def run(self, src, filename=None, debug=False, upload=True): + def dearxiv(self, input_file): + """Remove the arXiv timestamp from a pdf""" + self.log("Removing arXiv timestamp") + basename = os.path.splitext(input_file)[0] + uncompress_file = basename + "_uncompress.pdf" + + status = subprocess.call( + [ + self.pdftk_path, + input_file, + "output", + uncompress_file, + "uncompress", + ] + ) + if not status == 0: + exception("pdftk failed to uncompress the pdf.") + + with open(uncompress_file, "rb") as fid: + data = fid.read() + # Remove the text element + data = re.sub( + b"\(arXiv:\d{4}\.\d{4,5}v\d\s+\[\w+\.\w+\]\s+\d{1,2}\s\w{3}\s\d{4}\)Tj", + b"()Tj", + data, + ) + # Remove the URL element + data = re.sub( + b"<<\\n\/URI \(http://arxiv\.org/abs/\d{4}\.\d{4,5}v\d\)\\n\/S /URI\\n>>\\n", + b"", + data, + ) + + removed_file = basename + "_removed.pdf" + with open(removed_file, "wb") as oid: + oid.write(data) + + output_file = basename + "_dearxiv.pdf" + status = subprocess.call( + [self.pdftk_path, removed_file, "output", output_file, "compress"] + ) + if not status == 0: + exception("pdftk failed to compress the pdf.") + + return output_file + + def run(self, src, filename=None): info = self.get_paper_info(src) clean_filename = self.create_filename(info, filename) tmp_filename = "paper.pdf" - self.retrieve_pdf(src, tmp_filename) - self.check_file_is_pdf(tmp_filename) - - ops = [self.dearxiv, self.crop, self.shrink] - intermediate_fname = tmp_filename - for op in ops: - intermediate_fname = op(tmp_filename) - shutil.move(intermediate_fname, clean_filename) - - if debug: - print("Paused in debug mode in dir: %s" % working_dir) - print("Press enter to exit.") - return input() - - if upload: - return self.upload_to_rm(clean_filename) - - if os.path.exists(os.path.join(start_wd, clean_filename)): - tmpfname = os.path.splitext(filename)[0] + "_cropped.pdf" - shutil.move(clean_filename, os.path.join(start_wd, tmpfname)) - else: - shutil.move(clean_filename, start_wd) + + self.initial_dir = os.getcwd() + with tempfile.TemporaryDirectory() as working_dir: + os.chdir(working_dir) + self.retrieve_pdf(src, tmp_filename) + self.check_file_is_pdf(tmp_filename) + + ops = [self.dearxiv, self.crop_pdf, self.shrink_pdf] + intermediate_fname = tmp_filename + for op in ops: + intermediate_fname = op(intermediate_fname) + shutil.move(intermediate_fname, clean_filename) + + if self.debug: + print("Paused in debug mode in dir: %s" % working_dir) + print("Press enter to exit.") + return input() + + if self.upload: + return self.upload_to_rm(clean_filename) + + target_path = os.path.join(self.initial_dir, clean_filename) + while os.path.exists(target_path): + base = 
os.path.splitext(target_path)[0] + target_path = base + "_.pdf" + shutil.move(clean_filename, target_path) + return target_path + class ArxivProvider(Provider): def __init__(self, *args, **kwargs): @@ -198,7 +320,7 @@ class ArxivProvider(Provider): exception("Couldn't figure out arXiv urls.") return abs_url, pdf_url - def validate(self, src): + def validate(src): """Check if the url is to an arXiv page. """ m = re.match( "https?://arxiv.org/(abs|pdf)/\d{4}\.\d{4,5}(v\d+)?(\.pdf)?", src @@ -208,13 +330,13 @@ class ArxivProvider(Provider): def retrieve_pdf(self, src, filename): """ Download the file and save as filename """ _, pdf_url = self.get_abs_pdf_urls(src) - download_url(pdf_url, filename) + self.download_url(pdf_url, filename) def get_paper_info(self, src): """ Extract the paper's authors, title, and publication year """ abs_url, _ = self.get_abs_pdf_urls(src) - logger.info("Getting paper info from arXiv") - page = get_page_with_retry(abs_url) + self.log("Getting paper info from arXiv") + page = self.get_page_with_retry(abs_url) soup = bs4.BeautifulSoup(page, "html.parser") authors = [ x["content"] @@ -225,6 +347,7 @@ class ArxivProvider(Provider): date = soup.find_all("meta", {"name": "citation_date"})[0]["content"] return dict(title=title, date=date, authors=authors) + class PMCProvider(Provider): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -245,9 +368,9 @@ class PMCProvider(Provider): pdf_url = url.rstrip("/") + "/pdf" # it redirects, usually else: exception("Couldn't figure out PMC urls.") - return pdf_url, abs_url + return abs_url, pdf_url - def validate(self, src): + def validate(src): m = re.fullmatch( "https?://www.ncbi.nlm.nih.gov/pmc/articles/PMC\d+.*", src ) @@ -255,12 +378,12 @@ class PMCProvider(Provider): def retrieve_pdf(self, src, filename): _, pdf_url = self.get_abs_pdf_urls(src) - download_url(pdf_url, filename) + self.download_url(pdf_url, filename) def get_paper_info(self, src): """ Extract the paper's authors, title, and publication year """ - logger.info("Getting paper info from PMC") - page = get_page_with_retry(src) + self.log("Getting paper info from PMC") + page = self.get_page_with_retry(src) soup = bs4.BeautifulSoup(page, "html.parser") authors = [ x["content"] @@ -279,12 +402,13 @@ class PMCProvider(Provider): date = date.replace(" ", "_") return dict(title=title, date=date, authors=authors) + class ACMProvider(Provider): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) def get_acm_pdf_url(self, url): - page = get_page_with_retry(url) + page = self.get_page_with_retry(url) soup = bs4.BeautifulSoup(page, "html.parser") thea = None for a in soup.find_all("a"): @@ -304,26 +428,28 @@ class ACMProvider(Provider): abs_url = url pdf_url = self.get_acm_pdf_url(url) if pdf_url is None: - exception("Couldn't extract PDF url from ACM citation page.") + exception( + "Couldn't extract PDF url from ACM citation page. Maybe it's behind a paywall?" + ) else: exception( "Couldn't figure out ACM urls, please provide a URL of the " "format: http(s)://dl.acm.org/citation.cfm?id=..." 
) - return pdf_url, abs_url + return abs_url, pdf_url def retrieve_pdf(self, src, filename): _, pdf_url = self.get_abs_pdf_urls(src) - download_url(pdf_url, filename) + self.download_url(pdf_url, filename) - def validate(self, src): + def validate(src): m = re.fullmatch("https?://dl.acm.org/citation.cfm\?id=\d+", src) return not m is None def get_paper_info(self, src): """ Extract the paper's authors, title, and publication year """ - logger.info("Getting paper info from ACM") - page = get_page_with_retry(src) + self.log("Getting paper info from ACM") + page = self.get_page_with_retry(src) soup = bs4.BeautifulSoup(page, "html.parser") authors = [ x["content"] @@ -337,32 +463,40 @@ class ACMProvider(Provider): title = soup.find_all("meta", {"name": "citation_title"})[0]["content"] date = soup.find_all("meta", {"name": "citation_date"})[0]["content"] if not re.match("\d{2}/\d{2}/\d{4}", date.strip()): - logger.warning( + self.log( "Couldn't extract year from ACM page, please raise an " - "issue on GitHub so I can fix it: %s", - GITHUB_URL, + "issue on GitHub so I can fix it: %s" % GITHUB_URL, + mode="warning", ) date = date.strip().split("/")[-1] return dict(title=title, date=date, authors=authors) + class LocalFileProvider(Provider): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) - def validate(self, src): + def validate(src): return os.path.exists(src) def retrieve_pdf(self, src, filename): - shutil.copy(src, filename) + source = os.path.join(self.initial_dir, src) + shutil.copy(source, filename) def get_paper_info(self, src): - return None + return {"filename": src} + + def create_filename(self, info, filename=None): + if not filename is None: + return filename + return os.path.basename(info["filename"]) + class PdfUrlProvider(Provider): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) - def validate(self, src): + def validate(src): try: result = urllib.parse.urlparse(src) return all([result.scheme, result.netloc, result.path]) @@ -370,15 +504,18 @@ class PdfUrlProvider(Provider): return False def retrieve_pdf(self, url, filename): - if filename is None: - exception( - "Filename must be provided with pdf url (use --filename)" - ) - download_url(url, filename) + self.download_url(url, filename) def get_paper_info(self, src): return None + def create_filename(self, info, filename=None): + if filename is None: + exception( + "Filename must be provided with PDFUrlProvider (use --filename)" + ) + return filename + def exception(msg): print("ERROR: " + msg, file=sys.stderr) @@ -386,102 +523,6 @@ def exception(msg): raise SystemExit(1) -def get_page_with_retry(url): - """Get the content of an url, retrying up to five times on failure. """ - - def retry(url, count): - if count < 5: - logger.info( - "Caught error for url %s. Retrying in 5 seconds." 
% url - ) - time.sleep(5) - else: - exception("Failed to download url: %s" % url) - - count = 0 - while True: - count += 1 - try: - res = requests.get(url, headers=HEADERS) - except requests.exceptions.ConnectionError: - retry(url, count) - continue - if res.ok: - logger.info("Downloading url: %s" % url) - return res.content - else: - retry(url, count) - - -def download_url(url, filename): - """Download the content of an url and save it to a filename """ - logger.info("Downloading file at url: %s" % url) - content = get_page_with_retry(url) - with open(filename, "wb") as fid: - fid.write(content) - - -def dearxiv(input_file, pdftk_path="pdftk"): - """Remove the arXiv timestamp from a pdf""" - logger.info("Removing arXiv timestamp") - basename = os.path.splitext(input_file)[0] - uncompress_file = basename + "_uncompress.pdf" - - status = subprocess.call( - [pdftk_path, input_file, "output", uncompress_file, "uncompress"] - ) - if not status == 0: - exception("pdftk failed to uncompress the pdf.") - - with open(uncompress_file, "rb") as fid: - data = fid.read() - # Remove the text element - data = re.sub( - b"\(arXiv:\d{4}\.\d{4,5}v\d\s+\[\w+\.\w+\]\s+\d{1,2}\s\w{3}\s\d{4}\)Tj", - b"()Tj", - data, - ) - # Remove the URL element - data = re.sub( - b"<<\\n\/URI \(http://arxiv\.org/abs/\d{4}\.\d{4,5}v\d\)\\n\/S /URI\\n>>\\n", - b"", - data, - ) - - removed_file = basename + "_removed.pdf" - with open(removed_file, "wb") as oid: - oid.write(data) - - output_file = basename + "_dearxiv.pdf" - status = subprocess.call( - [pdftk_path, removed_file, "output", output_file, "compress"] - ) - if not status == 0: - exception("pdftk failed to compress the pdf.") - - return output_file - - -def upload_to_rm(filepath, remarkable_dir="/", rmapi_path="rmapi"): - remarkable_dir = remarkable_dir.rstrip("/") - logger.info("Starting upload to reMarkable") - if remarkable_dir: - status = subprocess.call( - [rmapi_path, "mkdir", remarkable_dir], stdout=subprocess.DEVNULL - ) - if not status == 0: - exception( - "Creating directory %s on reMarkable failed" % remarkable_dir - ) - status = subprocess.call( - [rmapi_path, "put", filepath, remarkable_dir + "/"], - stdout=subprocess.DEVNULL, - ) - if not status == 0: - exception("Uploading file %s to reMarkable failed" % filepath) - logger.info("Upload successful.") - - def parse_args(): parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter @@ -529,8 +570,7 @@ def parse_args(): return parser.parse_args() -@logger.catch -def newmain(): +def main(): args = parse_args() providers = [ @@ -545,91 +585,18 @@ def newmain(): if provider is None: exception("Input not valid, no provider can handle this source.") - if not args.verbose: - logger.remove(0) - - - start_wd = os.getcwd() - with tempfile.TemporaryDirector() as working_dir: - provider.run(args.input, debug=args.debug, upload=not args.no_upload) - - -@logger.catch -def main(): - args = parse_args() + prov = provider( + args.verbose, + not args.no_upload, + args.debug, + args.remarkable_dir, + args.rmapi, + args.pdfcrop, + args.pdftk, + args.gs, + ) - if os.path.exists(args.input): - mode = "local_file" - elif arxiv_url(args.input): - mode = "arxiv_url" - elif pmc_url(args.input): - mode = "pmc_url" - elif acm_url(args.input): - mode = "acm_url" - elif valid_url(args.input): - if args.filename is None: - exception( - "Filename must be provided with pdf url (use --filename)" - ) - mode = "pdf_url" - else: - exception("Input not a valid url, arxiv url, or existing file.") - - if not 
args.verbose: - logger.remove(0) - - start_wd = os.getcwd() - - with tempfile.TemporaryDirectory() as working_dir: - if mode == "local_file": - shutil.copy(args.input, working_dir) - filename = os.path.basename(args.input) - clean_filename = args.filename if args.filename else filename - - os.chdir(working_dir) - if mode in ["arxiv_url", "pmc_url", "acm_url", "pdf_url"]: - filename = "paper.pdf" - if mode == "arxiv_url": - pdf_url, abs_url = get_arxiv_urls(args.input) - paper_info = get_paper_info_arxiv(abs_url) - elif mode == "pmc_url": - pdf_url, abs_url = get_pmc_urls(args.input) - paper_info = get_paper_info_pmc(abs_url) - elif mode == "acm_url": - pdf_url, abs_url = get_acm_urls(args.input) - paper_info = get_paper_info_acm(abs_url) - else: - pdf_url = args.input - download_url(pdf_url, filename) - if not check_file_is_pdf(filename): - exception("Downloaded file isn't a valid pdf file.") - if args.filename: - clean_filename = args.filename - else: - clean_filename = generate_filename(paper_info) - - dearxived = dearxiv(filename, pdftk_path=args.pdftk) - cropped = crop_pdf(dearxived, pdfcrop_path=args.pdfcrop) - shrinked = shrink_pdf(cropped) - shutil.move(shrinked, clean_filename) - - if args.debug: - print("Paused in debug mode in dir: %s" % working_dir) - print("Press enter to exit.") - return input() - - if args.no_upload: - if os.path.exists(os.path.join(start_wd, clean_filename)): - tmpfname = os.path.splitext(filename)[0] + "_cropped.pdf" - shutil.move(clean_filename, os.path.join(start_wd, tmpfname)) - else: - shutil.move(clean_filename, start_wd) - else: - upload_to_rm( - clean_filename, - remarkable_dir=args.remarkable_dir, - rmapi_path=args.rmapi, - ) + prov.run(args.input, filename=args.filename) if __name__ == "__main__": -- cgit v1.2.3 From 2fd2c4e12aed954c1df930db76594a9e6ef5a97d Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Wed, 29 May 2019 23:11:09 +0100 Subject: add tests --- test.py | 97 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 97 insertions(+) create mode 100644 test.py diff --git a/test.py b/test.py new file mode 100644 index 0000000..2ec59d8 --- /dev/null +++ b/test.py @@ -0,0 +1,97 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +__author__ = "G.J.J. 
van den Burg" + +"""Tests""" + +import unittest +import tempfile +import hashlib +import shutil +import os + +from arxiv2remarkable import ( + ArxivProvider, + PMCProvider, + ACMProvider, + LocalFileProvider, + PdfUrlProvider, +) + + +def md5sum(filename): + blocksize = 65536 + hasher = hashlib.md5() + with open(filename, "rb") as fid: + buf = fid.read(blocksize) + while len(buf) > 0: + hasher.update(buf) + buf = fid.read(blocksize) + return hasher.hexdigest() + + +class Tests(unittest.TestCase): + @classmethod + def setUpClass(cls): + cls.original_dir = os.getcwd() + + def setUp(self): + self.test_dir = tempfile.mkdtemp() + os.chdir(self.test_dir) + + def tearDown(self): + os.chdir(self.original_dir) + shutil.rmtree(self.test_dir) + + def test_arxiv(self): + prov = ArxivProvider(upload=False) + url = "https://arxiv.org/abs/1811.11242v1" + exp_filename = "Burg_Nazabal_Sutton_-_Wrangling_Messy_CSV_Files_by_Detecting_Row_and_Type_Patterns_2018.pdf" + filename = prov.run(url) + self.assertEqual(exp_filename, os.path.basename(filename)) + fsize = os.path.getsize(filename) + self.assertTrue(1054082 < fsize <= 1056082) + + def test_pmc(self): + prov = PMCProvider(upload=False) + url = "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3474301/" + exp_filename = ( + "Hoogenboom_Manske_-_How_to_Write_a_Scientific_Article_2012.pdf" + ) + filename = prov.run(url) + self.assertEqual(exp_filename, os.path.basename(filename)) + fsize = os.path.getsize(filename) + self.assertTrue(376640 < fsize <= 378640) + + def test_acm(self): + prov = ACMProvider(upload=False) + url = "https://dl.acm.org/citation.cfm?id=3300356" + exp_filename = "Muller_et_al_-_How_Data_Science_Workers_Work_With_Data_Discovery_Capture_Curation_Design_Creation_2019.pdf" + filename = prov.run(url) + self.assertEqual(exp_filename, os.path.basename(filename)) + fsize = os.path.getsize(filename) + self.assertTrue(1691444 < fsize <= 1693444) + + def test_local(self): + local_filename = "test.pdf" + with open(local_filename, "w") as fp: + fp.write( + "%PDF-1.1\n%¥±ë\n\n1 0 obj\n << /Type /Catalog\n /Pages 2 0 R\n >>\nendobj\n\n2 0 obj\n << /Type /Pages\n /Kids [3 0 R]\n /Count 1\n /MediaBox [0 0 300 144]\n >>\nendobj\n\n3 0 obj\n << /Type /Page\n /Parent 2 0 R\n /Resources\n << /Font\n << /F1\n << /Type /Font\n /Subtype /Type1\n /BaseFont /Times-Roman\n >>\n >>\n >>\n /Contents 4 0 R\n >>\nendobj\n\n4 0 obj\n << /Length 55 >>\nstream\n BT\n /F1 18 Tf\n 0 0 Td\n (Hello World) Tj\n ET\nendstream\nendobj\n\nxref\n0 5\n0000000000 65535 f \n0000000018 00000 n \n0000000077 00000 n \n0000000178 00000 n \n0000000457 00000 n \ntrailer\n << /Root 1 0 R\n /Size 5\n >>\nstartxref\n565\n%%EOF" + ) + prov = LocalFileProvider(upload=False) + filename = prov.run(local_filename) + self.assertEqual("test_.pdf", os.path.basename(filename)) + fsize = os.path.getsize(filename) + self.assertTrue(5843 < fsize <= 7843) + + def test_pdfurl(self): + prov = PdfUrlProvider(upload=False) + url = "http://www.jmlr.org/papers/volume17/14-526/14-526.pdf" + filename = prov.run(url, filename="test.pdf") + self.assertEqual("test.pdf", os.path.basename(filename)) + fsize = os.path.getsize(filename) + self.assertTrue(1828169 < fsize <= 1830169) + +if __name__ == "__main__": + unittest.main() -- cgit v1.2.3 From 9ec285cc584e238ea15d669ec346ecd1db0fd68e Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Thu, 30 May 2019 00:42:58 +0100 Subject: Update Readme --- README.md | 24 +++++++++++++----------- arxiv2remarkable.py | 10 +--------- 2 files changed, 14 insertions(+), 20 
deletions(-) diff --git a/README.md b/README.md index 3b7be2a..3c4f9b9 100644 --- a/README.md +++ b/README.md @@ -57,16 +57,18 @@ And here's an example with verbose mode enabled that shows everything the script does: ```bash $ python arxiv2remarkable.py -v https://arxiv.org/abs/1811.11242 -2019-02-03 18:11:41.816 | INFO | __main__:download_url:106 - Downloading file at url: https://arxiv.org/pdf/1811.11242v1.pdf -2019-02-03 18:11:46.833 | INFO | __main__:get_page_with_retry:92 - Downloading url: https://arxiv.org/pdf/1811.11242v1.pdf -2019-02-03 18:11:46.835 | INFO | __main__:get_paper_info:194 - Getting paper info from arXiv -2019-02-03 18:11:47.496 | INFO | __main__:get_page_with_retry:92 - Downloading url: https://arxiv.org/abs/1811.11242v1 -2019-02-03 18:11:47.508 | INFO | __main__:generate_filename:206 - Generating output filename -2019-02-03 18:11:47.508 | INFO | __main__:dearxiv:114 - Removing arXiv timestamp -2019-02-03 18:11:49.221 | INFO | __main__:crop_pdf:154 - Cropping pdf file -2019-02-03 18:11:53.247 | INFO | __main__:shrink_pdf:172 - Shrinking pdf file -2019-02-03 18:11:54.802 | INFO | __main__:upload_to_rm:218 - Starting upload to reMarkable -2019-02-03 18:11:57.767 | INFO | __main__:upload_to_rm:223 - Upload successful. +2019-05-30 00:38:27 - INFO - Starting ArxivProvider +2019-05-30 00:38:27 - INFO - Getting paper info from arXiv +2019-05-30 00:38:27 - INFO - Downloading url: https://arxiv.org/abs/1811.11242 +2019-05-30 00:38:27 - INFO - Generating output filename +2019-05-30 00:38:27 - INFO - Created filename: Burg_Nazabal_Sutton_-_Wrangling_Messy_CSV_Files_by_Detecting_Row_and_Type_Patterns_2018.pdf +2019-05-30 00:38:27 - INFO - Downloading file at url: https://arxiv.org/pdf/1811.11242.pdf +2019-05-30 00:38:32 - INFO - Downloading url: https://arxiv.org/pdf/1811.11242.pdf +2019-05-30 00:38:32 - INFO - Removing arXiv timestamp +2019-05-30 00:38:34 - INFO - Cropping pdf file +2019-05-30 00:38:37 - INFO - Shrinking pdf file +2019-05-30 00:38:38 - INFO - Starting upload to reMarkable +2019-05-30 00:38:42 - INFO - Upload successful. ``` ## Dependencies @@ -79,7 +81,7 @@ The script requires the following external programs to be available: - [GhostScript](https://www.ghostscript.com/) - [rMAPI](https://github.com/juruen/rmapi) -If these scripts are not available on the PATH variable, you can supply them +If these scripts are not available on the ``PATH`` variable, you can supply them with the relevant options to the script. The script also needs the following Python packages: diff --git a/arxiv2remarkable.py b/arxiv2remarkable.py index 8a8d58b..b3abc3c 100755 --- a/arxiv2remarkable.py +++ b/arxiv2remarkable.py @@ -5,15 +5,7 @@ __version__ = "0.2.0" __author__ = "G.J.J. van den Burg" """ -Given an arXiv paper url this script: - -1. Downloads the paper -2. Strips the timestamp -3. Crops the pdf to remove unnecessary borders -4. Shrinks the pdf to reduce the filesize -5. Renames it using the format: - '_'.join(author_lastnames) + '_-_' + title + '_' + year.pdf -6. Uploads it to the reMarkable using rMapi. +Download a paper from various sources and send it to the reMarkable. Author: G.J.J. 
van den Burg Date: 2019-02-02 -- cgit v1.2.3 From d22efb98c201661e27ab4225164c1ef491de77f2 Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Thu, 30 May 2019 00:43:11 +0100 Subject: Remove loguru dependency --- README.md | 3 +-- poetry.lock | 60 +--------------------------------------------------------- pyproject.toml | 1 - 3 files changed, 2 insertions(+), 62 deletions(-) diff --git a/README.md b/README.md index 3c4f9b9..bbda5a7 100644 --- a/README.md +++ b/README.md @@ -88,14 +88,13 @@ The script also needs the following Python packages: - [BeautifulSoup4](https://pypi.org/project/beautifulsoup4/): parsing HTML - [requests](https://pypi.org/project/requests/): getting HTML -- [loguru](https://pypi.org/project/loguru/): easy logging - [PyPDF2](https://github.com/mstamy2/PyPDF2): verifying urls point to PDF - [titlecase](https://pypi.org/project/titlecase/): fancy titles You can use this line: ```bash -pip install --user bs4 requests loguru PyPDF2 titlecase +pip install --user bs4 requests PyPDF2 titlecase ``` # Notes diff --git a/poetry.lock b/poetry.lock index d8a1205..893007f 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,14 +1,3 @@ -[[package]] -category = "main" -description = "Produce colored terminal text with an xml-like markup" -name = "ansimarkup" -optional = false -python-versions = "*" -version = "1.4.0" - -[package.dependencies] -colorama = "*" - [[package]] category = "main" description = "Screen-scraping library" @@ -20,19 +9,6 @@ version = "4.7.1" [package.dependencies] soupsieve = ">=1.2" -[[package]] -category = "main" -description = "Pretty and helpful exceptions, automatically" -name = "better-exceptions-fork" -optional = false -python-versions = "*" -version = "0.2.1.post6" - -[package.dependencies] -ansimarkup = ">=1.3.0" -colorama = "*" -pygments = ">=2.2.0" - [[package]] category = "main" description = "Dummy package for Beautiful Soup" @@ -60,14 +36,6 @@ optional = false python-versions = "*" version = "3.0.4" -[[package]] -category = "main" -description = "Cross-platform colored terminal text." -name = "colorama" -optional = false -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" -version = "0.4.1" - [[package]] category = "main" description = "Internationalized Domain Names in Applications (IDNA)" @@ -76,27 +44,6 @@ optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" version = "2.8" -[[package]] -category = "main" -description = "Python logging made (stupidly) simple" -name = "loguru" -optional = false -python-versions = ">=3.5" -version = "0.2.5" - -[package.dependencies] -ansimarkup = ">=1.4.0" -better-exceptions-fork = ">=0.2.1.post6" -colorama = ">=0.3.4" - -[[package]] -category = "main" -description = "Pygments is a syntax highlighting package written in Python." -name = "pygments" -optional = false -python-versions = "*" -version = "2.3.1" - [[package]] category = "main" description = "Python HTTP for Humans." 
@@ -128,20 +75,15 @@ python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, <4" version = "1.24.1" [metadata] -content-hash = "b92b4b1d2c4f9d3181044c1ad99fd9bfa49e8618c6ff5de7bd64c557bcc27e39" +content-hash = "bf39364b4d9cc98c89d858338ce8e7609e35fdd4a7e5cfe256f768c12ed7cb82" python-versions = "^3.5" [metadata.hashes] -ansimarkup = ["06365e3ef89a12734fc408b2449cb4642d5fe2e603e95e7296eff9e98a0fe0b4", "174d920481416cec8d5a707af542d6fba25a1df1c21d8996479c32ba453649a4"] beautifulsoup4 = ["034740f6cb549b4e932ae1ab975581e6103ac8f942200a0e9759065984391858", "945065979fb8529dd2f37dbb58f00b661bdbcbebf954f93b32fdf5263ef35348", "ba6d5c59906a85ac23dadfe5c88deaf3e179ef565f4898671253e50a78680718"] -better-exceptions-fork = ["5f0983da51e956dbdaf8b9a3d10e2774b382ce6c6ff2e54685c33e2dbe8f1472"] bs4 = ["36ecea1fd7cc5c0c6e4a1ff075df26d50da647b75376626cc186e2212886dd3a"] certifi = ["47f9c83ef4c0c621eaef743f133f09fa8a74a9b75f037e8624f83bd1b6626cb7", "993f830721089fef441cdfeb4b2c8c9df86f0c63239f06bd025a76a7daddb033"] chardet = ["84ab92ed1c4d4f16916e05906b6b75a6c0fb5db821cc65e70cbd64a3e2a5eaae", "fc323ffcaeaed0e0a02bf4d117757b98aed530d9ed4531e3e15460124c106691"] -colorama = ["05eed71e2e327246ad6b38c540c4a3117230b19679b875190486ddd2d721422d", "f8ac84de7840f5b9c4e3347b3c1eaa50f7e49c2b07596221daec5edaabbd7c48"] idna = ["c357b3f628cf53ae2c4c05627ecc484553142ca23264e593d327bcde5e9c3407", "ea8b7f6188e6fa117537c3df7da9fc686d485087abf6ac197f9c46432f7e4a3c"] -loguru = ["68297d9f23064c2f4764bb5d0c5c767f3ed7f9fc1218244841878f5fc7c94add", "ebac59630946721fd6207264679b267a8bdc290b086226067d6aad86830e3123"] -pygments = ["5ffada19f6203563680669ee7f53b64dabbeb100eb51b61996085e99c03b284a", "e8218dd399a61674745138520d0d4cf2621d7e032439341bc3f647bff125818d"] requests = ["502a824f31acdacb3a35b6690b5fbf0bc41d63a24a45c4004352b0242707598e", "7bf2a778576d825600030a110f3c0e3e8edc51dfaafe1c146e39a2027784957b"] soupsieve = ["466910df7561796a60748826781ebe9a888f7a1668a636ae86783f44d10aae73", "87db12ae79194f0ff9808d2b1641c4f031ae39ffa3cab6b907ea7c1e5e5ed445"] urllib3 = ["61bf29cada3fc2fbefad4fdf059ea4bd1b4a86d2b6d15e1c7c0b582b9752fe39", "de9529817c93f27c8ccbfead6985011db27bd0ddfcdb2d86f3f663385c6a9c22"] diff --git a/pyproject.toml b/pyproject.toml index 6f67ecd..2c28224 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -9,7 +9,6 @@ license = "MIT" python = "^3.5" bs4 = "^0.0.1" requests = "^2.21" -loguru = "^0.2.5" [tool.poetry.dev-dependencies] -- cgit v1.2.3 From f867b11ea01f7c13219268e9bf8be46e7eebefca Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Thu, 30 May 2019 11:26:39 +0100 Subject: Add warn method --- arxiv2remarkable.py | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/arxiv2remarkable.py b/arxiv2remarkable.py index b3abc3c..d08efd7 100755 --- a/arxiv2remarkable.py +++ b/arxiv2remarkable.py @@ -77,6 +77,9 @@ class Provider(metaclass=abc.ABCMeta): + msg ) + def warn(self, msg): + self.log(msg, mode="warning") + @staticmethod @abc.abstractmethod def validate(src): @@ -115,15 +118,12 @@ class Provider(metaclass=abc.ABCMeta): stdout=subprocess.DEVNULL, ) if not status == 0: - self.log( - "Failed to crop the pdf file at: %s" % filepath, mode="warning" - ) + self.warn("Failed to crop the pdf file at: %s" % filepath) return filepath cropped_file = os.path.splitext(filepath)[0] + "-crop.pdf" if not os.path.exists(cropped_file): - self.log( - "Can't find cropped file '%s' where expected." % cropped_file, - mode="warning", + self.warn( + "Can't find cropped file '%s' where expected." 
% cropped_file ) return filepath return cropped_file @@ -145,7 +145,7 @@ class Provider(metaclass=abc.ABCMeta): ] ) if not status == 0: - self.log("Failed to shrink the pdf file", mode="warning") + self.warn("Failed to shrink the pdf file") return filepath return output_file @@ -455,10 +455,9 @@ class ACMProvider(Provider): title = soup.find_all("meta", {"name": "citation_title"})[0]["content"] date = soup.find_all("meta", {"name": "citation_date"})[0]["content"] if not re.match("\d{2}/\d{2}/\d{4}", date.strip()): - self.log( + self.warn( "Couldn't extract year from ACM page, please raise an " - "issue on GitHub so I can fix it: %s" % GITHUB_URL, - mode="warning", + "issue on GitHub so I can fix it: %s" % GITHUB_URL ) date = date.strip().split("/")[-1] return dict(title=title, date=date, authors=authors) -- cgit v1.2.3 From 2fae91fb5918ec2eb43e9a3956e7092f98e39c9a Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Thu, 30 May 2019 11:26:48 +0100 Subject: Simplify get_page_with_retry --- arxiv2remarkable.py | 30 +++++++++--------------------- 1 file changed, 9 insertions(+), 21 deletions(-) diff --git a/arxiv2remarkable.py b/arxiv2remarkable.py index d08efd7..08beaca 100755 --- a/arxiv2remarkable.py +++ b/arxiv2remarkable.py @@ -166,33 +166,21 @@ class Provider(metaclass=abc.ABCMeta): with open(filename, "wb") as fid: fid.write(content) - def get_page_with_retry(self, url, times=5): - """ Get the content of an url, retrying on failure. - """ - - def retry(url, count): - if count < times: - self.log( - "Caught error for url %s. Retrying in 5 seconds." % url, - mode="warning", - ) - time.sleep(5) - else: - exception("Failed to download url: %s" % url) - + def get_page_with_retry(self, url, tries=5): count = 0 - while True: + while count < tries: count += 1 + error = False try: res = requests.get(url, headers=HEADERS) except requests.exceptions.ConnectionError: - retry(url, count) + error = True + if error or not res.ok: + time.sleep(5) + self.warn("Error getting url %s. Retrying in 5 seconds" % url) continue - if res.ok: - self.log("Downloading url: %s" % url) - return res.content - else: - retry(url, count) + self.log("Downloading url: %s" % url) + return res.content def upload_to_rm(self, filepath): remarkable_dir = self.remarkable_dir.rstrip("/") -- cgit v1.2.3 From 5f7ff9883a5bdd7f1f0b0b4a2eda56f631885d9e Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Thu, 30 May 2019 11:27:58 +0100 Subject: add gitignore --- .gitignore | 1 + 1 file changed, 1 insertion(+) create mode 100644 .gitignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..c18dd8d --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +__pycache__/ -- cgit v1.2.3
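
The net effect of the series above is that every supported source (an arXiv page, a PMC article, an ACM citation page, a local file, or a plain pdf url) is handled by its own Provider subclass, while downloading, pdf validation, dearxiv-ing, cropping, shrinking and the reMarkable upload live in the Provider base class and are driven by Provider.run(). Based on the test suite added in the "add tests" commit, the classes are used roughly as sketched below; upload=False and the example urls come from those tests, while verbose=True and the printed path are illustrative only.

    # Usage sketch for the refactored providers (mirrors test.py above).
    # Assumes the arxiv2remarkable.py from the last commit in this series.
    from arxiv2remarkable import ArxivProvider, PdfUrlProvider

    # upload=False skips rMAPI and moves the processed pdf back into the
    # directory run() was called from; run() returns the resulting path.
    prov = ArxivProvider(verbose=True, upload=False)
    path = prov.run("https://arxiv.org/abs/1811.11242v1")
    # e.g. Burg_Nazabal_Sutton_-_Wrangling_Messy_CSV_Files_by_Detecting_Row_and_Type_Patterns_2018.pdf
    print(path)

    # A bare pdf url carries no metadata, so PdfUrlProvider requires an
    # explicit filename (its create_filename() raises an error otherwise).
    PdfUrlProvider(upload=False).run(
        "http://www.jmlr.org/papers/volume17/14-526/14-526.pdf",
        filename="test.pdf",
    )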
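Supporting another source then amounts to one more subclass that answers three questions: does this url belong to me (validate), how do I fetch the pdf (retrieve_pdf), and what title/authors/date should create_filename() use (get_paper_info). A minimal, hypothetical sketch follows; the url pattern and the "/pdf" suffix are invented for illustration, the citation_* meta tags simply mirror what the arXiv provider reads, and the code is written as if it lived inside arxiv2remarkable.py, where re, bs4 and Provider are already in scope.

    # Hypothetical provider, illustrating the minimal Provider interface.
    class ExampleJournalProvider(Provider):
        def validate(src):
            # called on the class in main(), so no self argument, matching
            # the other providers in this series
            return re.match("https?://journal.example.org/paper/\d+", src) is not None

        def retrieve_pdf(self, src, filename):
            # invented convention: the pdf lives at <article url>/pdf
            self.download_url(src + "/pdf", filename)

        def get_paper_info(self, src):
            self.log("Getting paper info from example journal")
            page = self.get_page_with_retry(src)
            soup = bs4.BeautifulSoup(page, "html.parser")
            authors = [
                a["content"].split(",")[0].strip()
                for a in soup.find_all("meta", {"name": "citation_author"})
            ]
            title = soup.find("meta", {"name": "citation_title"})["content"]
            date = soup.find("meta", {"name": "citation_date"})["content"]
            return dict(title=title, date=date, authors=authors)

Registering such a class would be a matter of appending it to the providers list in main(), which already selects the first provider whose validate() accepts the input.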