From 7551591bf876f005c47a5fe98618e0ec6e2412d2 Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Thu, 24 Oct 2019 15:01:32 +0100 Subject: Move download functionality to utils --- paper2remarkable/providers/_base.py | 44 +++++++------------------------------ paper2remarkable/utils.py | 36 ++++++++++++++++++++++++++++++ 2 files changed, 44 insertions(+), 36 deletions(-) diff --git a/paper2remarkable/providers/_base.py b/paper2remarkable/providers/_base.py index f703874..4354776 100644 --- a/paper2remarkable/providers/_base.py +++ b/paper2remarkable/providers/_base.py @@ -12,22 +12,19 @@ import abc import bs4 import logging import os -import requests import shutil import string import tempfile -import time import titlecase import unidecode from ..pdf_ops import crop_pdf, center_pdf, blank_pdf, shrink_pdf -from ..utils import upload_to_remarkable, check_file_is_pdf - -HEADERS = { - "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) " - "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 " - "Safari/537.36" -} +from ..utils import ( + upload_to_remarkable, + check_file_is_pdf, + download_url, + get_page_with_retry, +) class Provider(metaclass=abc.ABCMeta): @@ -90,7 +87,7 @@ class Provider(metaclass=abc.ABCMeta): def retrieve_pdf(self, src, filename): """ Download pdf from src and save to filename """ _, pdf_url = self.get_abs_pdf_urls(src) - self.download_url(pdf_url, filename) + download_url(pdf_url, filename) def _format_authors(self, soup_authors, sep=",", idx=0, op=None): op = (lambda x: x) if op is None else op @@ -127,7 +124,7 @@ class Provider(metaclass=abc.ABCMeta): """ Retrieve the title/author (surnames)/year information """ abs_url, _ = self.get_abs_pdf_urls(src) logging.info("Getting paper info") - page = self.get_page_with_retry(abs_url) + page = get_page_with_retry(abs_url) soup = bs4.BeautifulSoup(page, "html.parser") authors = self.get_authors(soup) title = self.get_title(soup) @@ -165,31 +162,6 @@ class Provider(metaclass=abc.ABCMeta): logging.info("Created filename: %s" % name) return name - def download_url(self, url, filename): - """Download the content of an url and save it to a filename """ - logging.info("Downloading file at url: %s" % url) - content = self.get_page_with_retry(url) - with open(filename, "wb") as fid: - fid.write(content) - - def get_page_with_retry(self, url, tries=5): - count = 0 - while count < tries: - count += 1 - error = False - try: - res = requests.get(url, headers=HEADERS) - except requests.exceptions.ConnectionError: - error = True - if error or not res.ok: - logging.warning( - "Error getting url %s. Retrying in 5 seconds" % url - ) - time.sleep(5) - continue - logging.info("Downloading url: %s" % url) - return res.content - def run(self, src, filename=None): info = self.get_paper_info(src) clean_filename = self.create_filename(info, filename) diff --git a/paper2remarkable/utils.py b/paper2remarkable/utils.py index 110453b..e2a714b 100644 --- a/paper2remarkable/utils.py +++ b/paper2remarkable/utils.py @@ -13,9 +13,17 @@ import PyPDF2 import logging import subprocess import sys +import requests +import time GITHUB_URL = "https://github.com/GjjvdBurg/arxiv2remarkable" +HEADERS = { + "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) " + "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 " + "Safari/537.36" +} + def exception(msg): print("ERROR: " + msg, file=sys.stderr) @@ -43,6 +51,34 @@ def check_file_is_pdf(filename): exception("File %s isn't a valid pdf file." % filename) +def download_url(url, filename): + """Download the content of an url and save it to a filename """ + logging.info("Downloading file at url: %s" % url) + content = get_page_with_retry(url) + with open(filename, "wb") as fid: + fid.write(content) + + +def get_page_with_retry(url, tries=5): + count = 0 + while count < tries: + count += 1 + error = False + try: + res = requests.get(url, headers=HEADERS) + except requests.exceptions.ConnectionError: + error = True + if error or not res.ok: + logging.warning( + "(%i/%i) Error getting url %s. Retrying in 5 seconds." % + (count, tries, url) + ) + time.sleep(5) + continue + logging.info("Downloading url: %s" % url) + return res.content + + def upload_to_remarkable(filepath, remarkable_dir="/", rmapi_path="rmapi"): logging.info("Starting upload to reMarkable") -- cgit v1.2.3