-rw-r--r--  README.md            12
-rwxr-xr-x  arxiv2remarkable.py  75
2 files changed, 74 insertions(+), 13 deletions(-)
diff --git a/README.md b/README.md
index c8ff0fe..04712df 100644
--- a/README.md
+++ b/README.md
@@ -4,6 +4,7 @@ This script makes it as easy to get a PDF on your reMarkable from any of the
 following sources:
 
 - an arXiv url (either ``arxiv.org/abs/...`` or ``arxiv.org/pdf/...``)
+- a PubMed Central url (either to the HTML or the PDF)
 - a url to a PDF file
 - a local file.
 
@@ -82,15 +83,16 @@ with the relevant options to the script.
 
 The script also needs the following Python packages:
 
-- [BeautifulSoup4](https://pypi.org/project/beautifulsoup4/)
-- [requests](https://pypi.org/project/requests/)
-- [loguru](https://pypi.org/project/loguru/)
-- [PyPDF2](https://github.com/mstamy2/PyPDF2)
+- [BeautifulSoup4](https://pypi.org/project/beautifulsoup4/): parsing HTML
+- [requests](https://pypi.org/project/requests/): getting HTML
+- [loguru](https://pypi.org/project/loguru/): easy logging
+- [PyPDF2](https://github.com/mstamy2/PyPDF2): verifying that urls point to a PDF
+- [titlecase](https://pypi.org/project/titlecase/): fancy titles
 
 You can use this line:
 
 ```bash
-pip install --user bs4 requests loguru PyPDF2
+pip install --user bs4 requests loguru PyPDF2 titlecase
 ```
 
 # Notes
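As a quick illustration of what the new titlecase dependency does (a sketch with a made-up title; output reflects titlecase's usual behavior):

```python
import titlecase

# Major words are capitalized; small words ("of", "the", "on") stay lowercase.
print(titlecase.titlecase("a study of the effects of x on y"))
# -> "A Study of the Effects of X on Y"
```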
diff --git a/arxiv2remarkable.py b/arxiv2remarkable.py
index b44f80f..ea8b4c2 100755
--- a/arxiv2remarkable.py
+++ b/arxiv2remarkable.py
@@ -29,6 +29,7 @@ import subprocess
 import sys
 import tempfile
 import time
+import titlecase
 import urllib.parse
 
 from loguru import logger
@@ -70,6 +71,11 @@ def arxiv_url(url):
     return not m is None
 
 
+def pmc_url(url):
+    m = re.fullmatch(r"https?://www.ncbi.nlm.nih.gov/pmc/articles/PMC\d+.*", url)
+    return m is not None
+
+
 def valid_url(url):
     try:
         result = urllib.parse.urlparse(url)
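For context, a self-contained sketch of how the new matcher behaves (the ids below are made up):

```python
import re

def pmc_url(url):  # as added in this commit
    m = re.fullmatch(r"https?://www.ncbi.nlm.nih.gov/pmc/articles/PMC\d+.*", url)
    return m is not None

print(pmc_url("https://www.ncbi.nlm.nih.gov/pmc/articles/PMC123456/"))                   # True
print(pmc_url("https://www.ncbi.nlm.nih.gov/pmc/articles/PMC123456/pdf/nihms98765.pdf")) # True
print(pmc_url("https://arxiv.org/abs/1701.00001"))                                       # False
```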
@@ -87,7 +93,7 @@ def check_file_is_pdf(filename):
 
 
 def get_arxiv_urls(url):
-    """Get the pdf and abs url from any given url """
+    """Get the pdf and abs url from any given arXiv url """
     if re.match("https?://arxiv.org/abs/\d{4}\.\d{4,5}(v\d+)?", url):
         abs_url = url
         pdf_url = url.replace("abs", "pdf") + ".pdf"
@@ -99,6 +105,23 @@ def get_arxiv_urls(url):
     return pdf_url, abs_url
 
 
+def get_pmc_urls(url):
+    """Get the pdf and html url from a given PMC url """
+    if re.match(
+        r"https?://www.ncbi.nlm.nih.gov/pmc/articles/PMC\d+/pdf/nihms\d+\.pdf",
+        url,
+    ):
+        idx = url.index("pdf")
+        abs_url = url[: idx - 1]
+        pdf_url = url
+    elif re.match(r"https?://www.ncbi.nlm.nih.gov/pmc/articles/PMC\d+/?", url):
+        abs_url = url
+        pdf_url = url.rstrip("/") + "/pdf"  # it redirects, usually
+    else:
+        exception("Couldn't figure out PMC urls.")
+    return pdf_url, abs_url
+
+
 def get_page_with_retry(url):
     """Get the content of an url, retrying up to five times on failure. """
     count = 0
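A sketch of what the two branches of get_pmc_urls return, with made-up PMC/nihms ids (assumes the function as defined above is in scope):

```python
# html url in: pdf url derived by appending /pdf (PMC redirects this, usually)
get_pmc_urls("https://www.ncbi.nlm.nih.gov/pmc/articles/PMC123456/")
# -> ("https://www.ncbi.nlm.nih.gov/pmc/articles/PMC123456/pdf",
#     "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC123456/")

# pdf url in: html url derived by cutting just before the first "pdf"
get_pmc_urls("https://www.ncbi.nlm.nih.gov/pmc/articles/PMC123456/pdf/nihms98765.pdf")
# -> ("https://www.ncbi.nlm.nih.gov/pmc/articles/PMC123456/pdf/nihms98765.pdf",
#     "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC123456")
```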
@@ -206,27 +229,51 @@ def shrink_pdf(filepath, gs_path="gs"):
     return output_file
 
 
-def get_paper_info(url):
+def get_paper_info_arxiv(url):
     logger.info("Getting paper info from arXiv")
     page = get_page_with_retry(url)
     soup = bs4.BeautifulSoup(page, "html.parser")
     authors = [
+        x["content"] for x in soup.find_all("meta", {"name": "citation_author"})
+    ]
+    authors = [x.split(",")[0].strip() for x in authors]
+    title = soup.find_all("meta", {"name": "citation_title"})[0]["content"]
+    date = soup.find_all("meta", {"name": "citation_date"})[0]["content"]
+    return dict(title=title, date=date, authors=authors)
+
+
+def get_paper_info_pmc(url):
+    """ Extract the paper's authors, title, and publication year """
+    logger.info("Getting paper info from PMC")
+    page = get_page_with_retry(url)
+    soup = bs4.BeautifulSoup(page, "html.parser")
+    authors = [
         x["content"]
-        for x in soup.find_all("meta", {"name": "citation_author"})
+        for x in soup.find_all("meta", {"name": "citation_authors"})
     ]
+    # We only use last names, and this method is a guess at best. I'm open to
+    # more advanced approaches.
+    authors = [x.strip().split(" ")[-1].strip() for x in authors[0].split(",")]
     title = soup.find_all("meta", {"name": "citation_title"})[0]["content"]
     date = soup.find_all("meta", {"name": "citation_date"})[0]["content"]
+    if re.match(r"\w+ \d{4}", date):
+        date = date.split(" ")[-1]
+    else:
+        date = date.replace(" ", "_")
     return dict(title=title, date=date, authors=authors)
 
 
 def generate_filename(info):
+    """ Generate a nice filename for a paper given the info dict """
+    # We assume that the list of authors is lastname only.
     logger.info("Generating output filename")
     if len(info["authors"]) > 3:
-        author_part = info["authors"][0].split(",")[0] + "_et_al"
+        author_part = info["authors"][0] + "_et_al"
     else:
-        author_part = "_".join([x.split(",")[0] for x in info["authors"]])
+        author_part = "_".join(info["authors"])
     author_part = author_part.replace(" ", "_")
-    title_part = info["title"].replace(",", "").replace(" ", "_")
+    title = info["title"].replace(",", "").replace(":", "")
+    title_part = titlecase.titlecase(title).replace(" ", "_")
     year_part = info["date"].split("/")[0]
     return author_part + "_-_" + title_part + "_" + year_part + ".pdf"
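To make the last-name heuristic in get_paper_info_pmc concrete, here is how it unpacks a hypothetical citation_authors value (assuming, as the code does, that the tag holds comma-separated full names; the names are invented):

```python
content = "Jane A. Smith, John B. Jones"  # hypothetical citation_authors value
authors = [x.strip().split(" ")[-1].strip() for x in content.split(",")]
print(authors)  # -> ['Smith', 'Jones']
```

If PMC instead emits "Lastname Initials" pairs, this picks up the initials rather than the last name, which is why the comment in the diff calls the method a guess.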
@@ -274,7 +321,7 @@ def parse_args():
         "--filename",
         help="Filename to use for the file on reMarkable",
         default=None,
-        )
+    )
     parser.add_argument(
         "-p",
         "--remarkable-path",
@@ -306,6 +353,8 @@ def main():
         mode = "local_file"
     elif arxiv_url(args.input):
         mode = "arxiv_url"
+    elif pmc_url(args.input):
+        mode = "pmc_url"
     elif valid_url(args.input):
         if args.filename is None:
             exception("Filename must be provided with pdf url (use --filename)")
@@ -332,7 +381,17 @@ def main():
         if args.filename:
             clean_filename = args.filename
         else:
-            paper_info = get_paper_info(abs_url)
+            paper_info = get_paper_info_arxiv(abs_url)
+            clean_filename = generate_filename(paper_info)
+
+    if mode == "pmc_url":
+        pdf_url, abs_url = get_pmc_urls(args.input)
+        filename = "paper.pdf"
+        download_url(pdf_url, filename)
+        if args.filename:
+            clean_filename = args.filename
+        else:
+            paper_info = get_paper_info_pmc(abs_url)
             clean_filename = generate_filename(paper_info)
 
     if mode == "pdf_url":
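Putting the pieces together, a sketch of the filename the updated generate_filename would produce for a made-up info dict (arXiv-style date; assumes the function as defined above is in scope):

```python
info = dict(
    title="a note on widget frobnication",  # invented paper title
    date="2019/03/01",
    authors=["Smith", "Jones"],  # last names only, as produced above
)
print(generate_filename(info))
# -> "Smith_Jones_-_A_Note_on_Widget_Frobnication_2019.pdf"
```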