Move download functionality to utils

author: Gertjan van den Burg <gertjanvandenburg@gmail.com> 2019-10-24 15:01:32 +0100
committer: Gertjan van den Burg <gertjanvandenburg@gmail.com> 2019-10-24 15:01:32 +0100
commit: 7551591bf876f005c47a5fe98618e0ec6e2412d2 (patch)
tree: b2e074c89ffdfecd3dec7230d9c165bf78da98fb
parent: Minor fixes to check_file_is_pdf (diff)
download: paper2remarkable-7551591bf876f005c47a5fe98618e0ec6e2412d2.tar.gz
paper2remarkable-7551591bf876f005c47a5fe98618e0ec6e2412d2.zip
2 files changed, 44 insertions, 36 deletions
diff --git a/paper2remarkable/providers/_base.py b/paper2remarkable/providers/_base.py
index f703874..4354776 100644
--- a/paper2remarkable/providers/_base.py
+++ b/paper2remarkable/providers/_base.py
@@ -12,22 +12,19 @@ import abc
 import bs4
 import logging
 import os
-import requests
 import shutil
 import string
 import tempfile
-import time
 import titlecase
 import unidecode
 
 from ..pdf_ops import crop_pdf, center_pdf, blank_pdf, shrink_pdf
-from ..utils import upload_to_remarkable, check_file_is_pdf
-
-HEADERS = {
-    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) "
-    "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 "
-    "Safari/537.36"
-}
+from ..utils import (
+    upload_to_remarkable,
+    check_file_is_pdf,
+    download_url,
+    get_page_with_retry,
+)
 
 
 class Provider(metaclass=abc.ABCMeta):
@@ -90,7 +87,7 @@ class Provider(metaclass=abc.ABCMeta):
     def retrieve_pdf(self, src, filename):
         """ Download pdf from src and save to filename """
         _, pdf_url = self.get_abs_pdf_urls(src)
-        self.download_url(pdf_url, filename)
+        download_url(pdf_url, filename)
 
     def _format_authors(self, soup_authors, sep=",", idx=0, op=None):
         op = (lambda x: x) if op is None else op
@@ -127,7 +124,7 @@ class Provider(metaclass=abc.ABCMeta):
         """ Retrieve the title/author (surnames)/year information """
         abs_url, _ = self.get_abs_pdf_urls(src)
         logging.info("Getting paper info")
-        page = self.get_page_with_retry(abs_url)
+        page = get_page_with_retry(abs_url)
         soup = bs4.BeautifulSoup(page, "html.parser")
         authors = self.get_authors(soup)
         title = self.get_title(soup)
@@ -165,31 +162,6 @@ class Provider(metaclass=abc.ABCMeta):
         logging.info("Created filename: %s" % name)
         return name
 
-    def download_url(self, url, filename):
-        """Download the content of an url and save it to a filename """
-        logging.info("Downloading file at url: %s" % url)
-        content = self.get_page_with_retry(url)
-        with open(filename, "wb") as fid:
-            fid.write(content)
-
-    def get_page_with_retry(self, url, tries=5):
-        count = 0
-        while count < tries:
-            count += 1
-            error = False
-            try:
-                res = requests.get(url, headers=HEADERS)
-            except requests.exceptions.ConnectionError:
-                error = True
-            if error or not res.ok:
-                logging.warning(
-                    "Error getting url %s. Retrying in 5 seconds" % url
-                )
-                time.sleep(5)
-                continue
-            logging.info("Downloading url: %s" % url)
-            return res.content
-
     def run(self, src, filename=None):
         info = self.get_paper_info(src)
         clean_filename = self.create_filename(info, filename)
diff --git a/paper2remarkable/utils.py b/paper2remarkable/utils.py
index 110453b..e2a714b 100644
--- a/paper2remarkable/utils.py
+++ b/paper2remarkable/utils.py
@@ -13,9 +13,17 @@ import PyPDF2
 import logging
 import subprocess
 import sys
+import requests
+import time
 
 GITHUB_URL = "https://github.com/GjjvdBurg/arxiv2remarkable"
 
+HEADERS = {
+    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) "
+    "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 "
+    "Safari/537.36"
+}
+
 
 def exception(msg):
     print("ERROR: " + msg, file=sys.stderr)
@@ -43,6 +51,34 @@ def check_file_is_pdf(filename):
         exception("File %s isn't a valid pdf file." % filename)
 
 
+def download_url(url, filename):
+    """Download the content of an url and save it to a filename """
+    logging.info("Downloading file at url: %s" % url)
+    content = get_page_with_retry(url)
+    with open(filename, "wb") as fid:
+        fid.write(content)
+
+
+def get_page_with_retry(url, tries=5):
+    count = 0
+    while count < tries:
+        count += 1
+        error = False
+        try:
+            res = requests.get(url, headers=HEADERS)
+        except requests.exceptions.ConnectionError:
+            error = True
+        if error or not res.ok:
+            logging.warning(
+                "(%i/%i) Error getting url %s. Retrying in 5 seconds." % 
+                (count, tries, url)
+            )
+            time.sleep(5)
+            continue
+        logging.info("Downloading url: %s" % url)
+        return res.content
+
+
 def upload_to_remarkable(filepath, remarkable_dir="/", rmapi_path="rmapi"):
     logging.info("Starting upload to reMarkable")
author	Gertjan van den Burg <gertjanvandenburg@gmail.com>	2019-10-24 15:01:32 +0100
committer	Gertjan van den Burg <gertjanvandenburg@gmail.com>	2019-10-24 15:01:32 +0100
commit	7551591bf876f005c47a5fe98618e0ec6e2412d2 (patch)
tree	b2e074c89ffdfecd3dec7230d9c165bf78da98fb
parent	Minor fixes to check_file_is_pdf (diff)
download	paper2remarkable-7551591bf876f005c47a5fe98618e0ec6e2412d2.tar.gz paper2remarkable-7551591bf876f005c47a5fe98618e0ec6e2412d2.zip