about | summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Gertjan van den Burg <gertjanvandenburg@gmail.com> 2019-10-09 14:46:59 +0100
committer Gertjan van den Burg <gertjanvandenburg@gmail.com> 2019-10-09 14:46:59 +0100
commit 9aeeb0133a447466bce46e07e5c9f0f74d02d901 (patch)
tree 83a2ad1e97eda2d7317f943f65ea9cc916e3dd22
parent Bugfix for sleep (diff)
download paper2remarkable-9aeeb0133a447466bce46e07e5c9f0f74d02d901.tar.gz
paper2remarkable-9aeeb0133a447466bce46e07e5c9f0f74d02d901.zip
Improve string cleaning (fixes #6)
-rwxr-xr-x arxiv2remarkable.py 19
1 file changed, 16 insertions, 3 deletions
diff --git a/arxiv2remarkable.py b/arxiv2remarkable.py
index 9ab4b96..8a91fd7 100755
--- a/arxiv2remarkable.py
+++ b/arxiv2remarkable.py
@@ -23,6 +23,7 @@ import pdfplumber
import re
import requests
import shutil
+import string
import subprocess
import sys
import tempfile
@@ -145,20 +146,32 @@ class Provider(metaclass=abc.ABCMeta):
date = self.get_date(soup)
return dict(title=title, date=date, authors=authors)
+ def string_clean(self, s):
+ """ Clean a string to replace accented characters with equivalents and
+ keep only the allowed characters """
+ normalized = unidecode.unidecode(s)
+ allowed = string.ascii_letters + string.digits + "_ ."
+ cleaned = "".join(c if c in allowed else "_" for c in normalized)
+ return cleaned
+
def create_filename(self, info, filename=None):
""" Generate filename using the info dict or filename if provided """
if not filename is None:
return filename
# we assume that the list of authors is surname only.
self.log("Generating output filename")
+
if len(info["authors"]) > 3:
author_part = info["authors"][0] + "_et_al"
else:
author_part = "_".join(info["authors"])
- author_part = author_part.replace(" ", "_")
- title = info["title"].replace(",", "").replace(":", "")
- title_part = titlecase.titlecase(title).replace(" ", "_")
+ author_part = self.string_clean(author_part)
+
+ title_part = self.string_clean(info["title"])
+ title_part = titlecase.titlecase(title_part).replace(" ", "_")
+
year_part = info["date"].split("/")[0]
+
name = author_part + "_-_" + title_part + "_" + year_part + ".pdf"
name = unidecode.unidecode(name)
self.log("Created filename: %s" % name)