about summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r-- paper2remarkable/providers/_base.py 15
-rw-r--r-- paper2remarkable/utils.py 19
2 files changed, 19 insertions, 15 deletions
diff --git a/paper2remarkable/providers/_base.py b/paper2remarkable/providers/_base.py
index 4354776..db13434 100644
--- a/paper2remarkable/providers/_base.py
+++ b/paper2remarkable/providers/_base.py
@@ -13,17 +13,17 @@ import bs4
import logging
import os
import shutil
-import string
import tempfile
import titlecase
import unidecode
from ..pdf_ops import crop_pdf, center_pdf, blank_pdf, shrink_pdf
from ..utils import (
- upload_to_remarkable,
check_file_is_pdf,
+ clean_string,
download_url,
get_page_with_retry,
+ upload_to_remarkable,
)
@@ -131,13 +131,6 @@ class Provider(metaclass=abc.ABCMeta):
date = self.get_date(soup)
return dict(title=title, date=date, authors=authors)
- def string_clean(self, s):
- """ Clean a string to replace accented characters with equivalents and
- keep only the allowed characters """
- normalized = unidecode.unidecode(s)
- allowed = string.ascii_letters + string.digits + "_ ."
- cleaned = "".join(c if c in allowed else "_" for c in normalized)
- return cleaned
def create_filename(self, info, filename=None):
""" Generate filename using the info dict or filename if provided """
@@ -150,9 +143,9 @@ class Provider(metaclass=abc.ABCMeta):
author_part = info["authors"][0] + "_et_al"
else:
author_part = "_".join(info["authors"])
- author_part = self.string_clean(author_part)
+ author_part = clean_string(author_part)
- title_part = self.string_clean(info["title"])
+ title_part = clean_string(info["title"])
title_part = titlecase.titlecase(title_part).replace(" ", "_")
year_part = info["date"].split("/")[0]
diff --git a/paper2remarkable/utils.py b/paper2remarkable/utils.py
index e2a714b..15cac95 100644
--- a/paper2remarkable/utils.py
+++ b/paper2remarkable/utils.py
@@ -8,13 +8,14 @@ Copyright: 2019, G.J.J. van den Burg
"""
-
import PyPDF2
import logging
+import requests
+import string
import subprocess
import sys
-import requests
import time
+import unidecode
GITHUB_URL = "https://github.com/GjjvdBurg/arxiv2remarkable"
@@ -36,6 +37,16 @@ def exception(msg):
raise SystemExit(1)
+def clean_string(s):
+ """ Clean a string by replacing accented characters with equivalents and
+ keeping only the allowed characters (ascii letters, digits, underscore,
+ space, and period)"""
+ normalized = unidecode.unidecode(s)
+ allowed = string.ascii_letters + string.digits + "_ ."
+ cleaned = "".join(c if c in allowed else "_" for c in normalized)
+ return cleaned
+
+
def check_file_is_pdf(filename):
"""Check that a given file is a PDF file.
@@ -70,8 +81,8 @@ def get_page_with_retry(url, tries=5):
error = True
if error or not res.ok:
logging.warning(
- "(%i/%i) Error getting url %s. Retrying in 5 seconds." %
- (count, tries, url)
+ "(%i/%i) Error getting url %s. Retrying in 5 seconds."
+ % (count, tries, url)
)
time.sleep(5)
continue