diff options
Diffstat (limited to 'paper2remarkable/providers/_base.py')
| -rw-r--r-- | paper2remarkable/providers/_base.py | 44 |
1 file changed, 8 insertions(+), 36 deletions(-)
diff --git a/paper2remarkable/providers/_base.py b/paper2remarkable/providers/_base.py
index f703874..4354776 100644
--- a/paper2remarkable/providers/_base.py
+++ b/paper2remarkable/providers/_base.py
@@ -12,22 +12,19 @@
 import abc
 import bs4
 import logging
 import os
-import requests
 import shutil
 import string
 import tempfile
-import time
 import titlecase
 import unidecode
 
 from ..pdf_ops import crop_pdf, center_pdf, blank_pdf, shrink_pdf
-from ..utils import upload_to_remarkable, check_file_is_pdf
-
-HEADERS = {
-    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) "
-    "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 "
-    "Safari/537.36"
-}
+from ..utils import (
+    upload_to_remarkable,
+    check_file_is_pdf,
+    download_url,
+    get_page_with_retry,
+)
 
 class Provider(metaclass=abc.ABCMeta):
@@ -90,7 +87,7 @@ class Provider(metaclass=abc.ABCMeta):
     def retrieve_pdf(self, src, filename):
         """ Download pdf from src and save to filename """
         _, pdf_url = self.get_abs_pdf_urls(src)
-        self.download_url(pdf_url, filename)
+        download_url(pdf_url, filename)
 
     def _format_authors(self, soup_authors, sep=",", idx=0, op=None):
         op = (lambda x: x) if op is None else op
@@ -127,7 +124,7 @@ class Provider(metaclass=abc.ABCMeta):
         """ Retrieve the title/author (surnames)/year information """
         abs_url, _ = self.get_abs_pdf_urls(src)
         logging.info("Getting paper info")
-        page = self.get_page_with_retry(abs_url)
+        page = get_page_with_retry(abs_url)
         soup = bs4.BeautifulSoup(page, "html.parser")
         authors = self.get_authors(soup)
         title = self.get_title(soup)
@@ -165,31 +162,6 @@ class Provider(metaclass=abc.ABCMeta):
         logging.info("Created filename: %s" % name)
         return name
 
-    def download_url(self, url, filename):
-        """Download the content of an url and save it to a filename """
-        logging.info("Downloading file at url: %s" % url)
-        content = self.get_page_with_retry(url)
-        with open(filename, "wb") as fid:
-            fid.write(content)
-
-    def get_page_with_retry(self, url, tries=5):
-        count = 0
-        while count < tries:
-            count += 1
-            error = False
-            try:
-                res = requests.get(url, headers=HEADERS)
-            except requests.exceptions.ConnectionError:
-                error = True
-            if error or not res.ok:
-                logging.warning(
-                    "Error getting url %s. Retrying in 5 seconds" % url
-                )
-                time.sleep(5)
-                continue
-            logging.info("Downloading url: %s" % url)
-            return res.content
-
     def run(self, src, filename=None):
         info = self.get_paper_info(src)
         clean_filename = self.create_filename(info, filename)
