diff options
| -rw-r--r-- | paper2remarkable/providers/_base.py | 15 | ||||
| -rw-r--r-- | paper2remarkable/utils.py | 19 |
2 files changed, 19 insertions, 15 deletions
diff --git a/paper2remarkable/providers/_base.py b/paper2remarkable/providers/_base.py index 4354776..db13434 100644 --- a/paper2remarkable/providers/_base.py +++ b/paper2remarkable/providers/_base.py @@ -13,17 +13,17 @@ import bs4 import logging import os import shutil -import string import tempfile import titlecase import unidecode from ..pdf_ops import crop_pdf, center_pdf, blank_pdf, shrink_pdf from ..utils import ( - upload_to_remarkable, check_file_is_pdf, + clean_string, download_url, get_page_with_retry, + upload_to_remarkable, ) @@ -131,13 +131,6 @@ class Provider(metaclass=abc.ABCMeta): date = self.get_date(soup) return dict(title=title, date=date, authors=authors) - def string_clean(self, s): - """ Clean a string to replace accented characters with equivalents and - keep only the allowed characters """ - normalized = unidecode.unidecode(s) - allowed = string.ascii_letters + string.digits + "_ ." - cleaned = "".join(c if c in allowed else "_" for c in normalized) - return cleaned def create_filename(self, info, filename=None): """ Generate filename using the info dict or filename if provided """ @@ -150,9 +143,9 @@ class Provider(metaclass=abc.ABCMeta): author_part = info["authors"][0] + "_et_al" else: author_part = "_".join(info["authors"]) - author_part = self.string_clean(author_part) + author_part = clean_string(author_part) - title_part = self.string_clean(info["title"]) + title_part = clean_string(info["title"]) title_part = titlecase.titlecase(title_part).replace(" ", "_") year_part = info["date"].split("/")[0] diff --git a/paper2remarkable/utils.py b/paper2remarkable/utils.py index e2a714b..15cac95 100644 --- a/paper2remarkable/utils.py +++ b/paper2remarkable/utils.py @@ -8,13 +8,14 @@ Copyright: 2019, G.J.J. van den Burg """ - import PyPDF2 import logging +import requests +import string import subprocess import sys -import requests import time +import unidecode GITHUB_URL = "https://github.com/GjjvdBurg/arxiv2remarkable" @@ -36,6 +37,16 @@ def exception(msg): raise SystemExit(1) +def clean_string(s): + """ Clean a string by replacing accented characters with equivalents and + keeping only the allowed characters (ascii letters, digits, underscore, + space, and period)""" + normalized = unidecode.unidecode(s) + allowed = string.ascii_letters + string.digits + "_ ." + cleaned = "".join(c if c in allowed else "_" for c in normalized) + return cleaned + + def check_file_is_pdf(filename): """Check that a given file is a PDF file. @@ -70,8 +81,8 @@ def get_page_with_retry(url, tries=5): error = True if error or not res.ok: logging.warning( - "(%i/%i) Error getting url %s. Retrying in 5 seconds." % - (count, tries, url) + "(%i/%i) Error getting url %s. Retrying in 5 seconds." + % (count, tries, url) ) time.sleep(5) continue |
