diff options
| -rwxr-xr-x | arxiv2remarkable.py | 19 |
1 file changed, 16 insertions, 3 deletions
diff --git a/arxiv2remarkable.py b/arxiv2remarkable.py
index 9ab4b96..8a91fd7 100755
--- a/arxiv2remarkable.py
+++ b/arxiv2remarkable.py
@@ -23,6 +23,7 @@ import pdfplumber
 import re
 import requests
 import shutil
+import string
 import subprocess
 import sys
 import tempfile
@@ -145,20 +146,32 @@ class Provider(metaclass=abc.ABCMeta):
         date = self.get_date(soup)
         return dict(title=title, date=date, authors=authors)
 
+    def string_clean(self, s):
+        """ Clean a string to replace accented characters with equivalents and
+        keep only the allowed characters """
+        normalized = unidecode.unidecode(s)
+        allowed = string.ascii_letters + string.digits + "_ ."
+        cleaned = "".join(c if c in allowed else "_" for c in normalized)
+        return cleaned
+
     def create_filename(self, info, filename=None):
         """ Generate filename using the info dict or filename if provided """
         if not filename is None:
             return filename
         # we assume that the list of authors is surname only.
         self.log("Generating output filename")
+
         if len(info["authors"]) > 3:
             author_part = info["authors"][0] + "_et_al"
         else:
             author_part = "_".join(info["authors"])
-        author_part = author_part.replace(" ", "_")
-        title = info["title"].replace(",", "").replace(":", "")
-        title_part = titlecase.titlecase(title).replace(" ", "_")
+        author_part = self.string_clean(author_part)
+
+        title_part = self.string_clean(info["title"])
+        title_part = titlecase.titlecase(title_part).replace(" ", "_")
+
         year_part = info["date"].split("/")[0]
+
         name = author_part + "_-_" + title_part + "_" + year_part + ".pdf"
         name = unidecode.unidecode(name)
         self.log("Created filename: %s" % name)
