1 file changed, 16 insertions, 3 deletions
diff --git a/arxiv2remarkable.py b/arxiv2remarkable.py
index 9ab4b96..8a91fd7 100755
--- a/arxiv2remarkable.py
+++ b/arxiv2remarkable.py
@@ -23,6 +23,7 @@ import pdfplumber
 import re
 import requests
 import shutil
+import string
 import subprocess
 import sys
 import tempfile
@@ -145,20 +146,32 @@ class Provider(metaclass=abc.ABCMeta):
         date = self.get_date(soup)
         return dict(title=title, date=date, authors=authors)
 
+    def string_clean(self, s):
+        """ Transliterate s with unidecode, then turn each character other
+        than an ASCII letter, digit, underscore, space or dot into "_". """
+        permitted = string.ascii_letters + string.digits + "_ ."
+        ascii_text = unidecode.unidecode(s)
+        pieces = [ch if ch in permitted else "_" for ch in ascii_text]
+        return "".join(pieces)
+
     def create_filename(self, info, filename=None):
         """ Generate filename using the info dict or filename if provided """
         if not filename is None:
             return filename
         # we assume that the list of authors is surname only.
         self.log("Generating output filename")
+
         if len(info["authors"]) > 3:
             author_part = info["authors"][0] + "_et_al"
         else:
             author_part = "_".join(info["authors"])
-        author_part = author_part.replace(" ", "_")
-        title = info["title"].replace(",", "").replace(":", "")
-        title_part = titlecase.titlecase(title).replace(" ", "_")
+        author_part = self.string_clean(author_part)
+
+        title_part = self.string_clean(info["title"])
+        title_part = titlecase.titlecase(title_part).replace(" ", "_")
+
         year_part = info["date"].split("/")[0]
+
         name = author_part + "_-_" + title_part + "_" + year_part + ".pdf"
         name = unidecode.unidecode(name)
         self.log("Created filename: %s" % name)