diff options
| author | Gertjan van den Burg <gertjanvandenburg@gmail.com> | 2019-10-09 14:46:59 +0100 |
|---|---|---|
| committer | Gertjan van den Burg <gertjanvandenburg@gmail.com> | 2019-10-09 14:46:59 +0100 |
| commit | 9aeeb0133a447466bce46e07e5c9f0f74d02d901 (patch) | |
| tree | 83a2ad1e97eda2d7317f943f65ea9cc916e3dd22 /arxiv2remarkable.py | |
| parent | Bugfix for sleep (diff) | |
| download | paper2remarkable-9aeeb0133a447466bce46e07e5c9f0f74d02d901.tar.gz paper2remarkable-9aeeb0133a447466bce46e07e5c9f0f74d02d901.zip | |
Improve string cleaning (fixes #6)
Diffstat (limited to 'arxiv2remarkable.py')
| -rwxr-xr-x | arxiv2remarkable.py | 19 |
1 files changed, 16 insertions, 3 deletions
```diff
diff --git a/arxiv2remarkable.py b/arxiv2remarkable.py
index 9ab4b96..8a91fd7 100755
--- a/arxiv2remarkable.py
+++ b/arxiv2remarkable.py
@@ -23,6 +23,7 @@ import pdfplumber
 import re
 import requests
 import shutil
+import string
 import subprocess
 import sys
 import tempfile
@@ -145,20 +146,32 @@ class Provider(metaclass=abc.ABCMeta):
         date = self.get_date(soup)
         return dict(title=title, date=date, authors=authors)
 
+    def string_clean(self, s):
+        """ Clean a string to replace accented characters with equivalents and
+        keep only the allowed characters """
+        normalized = unidecode.unidecode(s)
+        allowed = string.ascii_letters + string.digits + "_ ."
+        cleaned = "".join(c if c in allowed else "_" for c in normalized)
+        return cleaned
+
     def create_filename(self, info, filename=None):
         """ Generate filename using the info dict or filename if provided """
         if not filename is None:
             return filename
         # we assume that the list of authors is surname only.
         self.log("Generating output filename")
+
         if len(info["authors"]) > 3:
             author_part = info["authors"][0] + "_et_al"
         else:
             author_part = "_".join(info["authors"])
-        author_part = author_part.replace(" ", "_")
-        title = info["title"].replace(",", "").replace(":", "")
-        title_part = titlecase.titlecase(title).replace(" ", "_")
+        author_part = self.string_clean(author_part)
+
+        title_part = self.string_clean(info["title"])
+        title_part = titlecase.titlecase(title_part).replace(" ", "_")
+
         year_part = info["date"].split("/")[0]
+
         name = author_part + "_-_" + title_part + "_" + year_part + ".pdf"
         name = unidecode.unidecode(name)
         self.log("Created filename: %s" % name)
```
