diff options
| author | Gertjan van den Burg <gertjanvandenburg@gmail.com> | 2019-08-19 17:42:47 +0100 |
|---|---|---|
| committer | Gertjan van den Burg <gertjanvandenburg@gmail.com> | 2019-08-19 17:42:47 +0100 |
| commit | 5965e2d32538defe6c79edabe177747ca2c114bc (patch) | |
| tree | a4cdcf650e835b851fa550f36b644e828afa9ad6 | |
| parent | Simplify provider names (diff) | |
| download | paper2remarkable-5965e2d32538defe6c79edabe177747ca2c114bc.tar.gz paper2remarkable-5965e2d32538defe6c79edabe177747ca2c114bc.zip | |
Add provider for Springer papers
| -rwxr-xr-x | arxiv2remarkable.py | 53 |
1 files changed, 45 insertions, 8 deletions
class Springer(Provider):
    """Provider for papers hosted on link.springer.com.

    Recognizes both url forms for a Springer paper:

    * abstract page:  https://link.springer.com/article/10.XXXX/<slug>
    * direct pdf:     https://link.springer.com/content/pdf/10.XXXX/<slug>.pdf
      (the DOI slash may also appear percent-encoded as ``%2F``)
    """

    # Raw strings avoid the invalid escape sequences (\/, \d, \-) that a
    # plain string literal would warn about, and the dots in the hostname
    # are escaped so they match a literal '.' instead of any character.
    re_abs = r"https?://link\.springer\.com/article/10\.\d{4}/[a-z0-9\-]+"
    re_pdf = (
        r"https?://link\.springer\.com/content/pdf/10\.\d{4}"
        r"(%2F|/)[a-z0-9\-]+\.pdf"
    )

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def get_abs_pdf_urls(self, url):
        """Get the pdf and abstract urls from a Springer url.

        Given either url form, derive the other by swapping the
        ``article`` and ``content/pdf`` path segments.  Calls the
        module-level ``exception`` helper when *url* matches neither
        pattern (which is expected to abort — TODO confirm it raises).
        """
        if re.match(self.re_abs, url):
            abs_url = url
            pdf_url = url.replace("article", "content/pdf")
        elif re.match(self.re_pdf, url):
            abs_url = url.replace("content/pdf", "article")
            pdf_url = url
        else:
            exception("Couldn't figure out Springer urls.")
        return abs_url, pdf_url

    @staticmethod
    def validate(src):
        """Return a match object (truthy) iff *src* is a Springer url.

        Decorated as a staticmethod because it is invoked on the class
        object itself during provider selection in ``main()``.
        """
        return re.match(Springer.re_abs, src) or re.match(Springer.re_pdf, src)

    def retrieve_pdf(self, src, filename):
        """Download the pdf for *src* and save it as *filename*."""
        _, pdf_url = self.get_abs_pdf_urls(src)
        self.download_url(pdf_url, filename)

    def get_paper_info(self, src):
        """Scrape title, date, and author surnames from the abstract page."""
        abs_url, _ = self.get_abs_pdf_urls(src)
        self.log("Getting paper info from Springer")
        page = self.get_page_with_retry(abs_url)
        soup = bs4.BeautifulSoup(page, "html.parser")
        authors = [
            x["content"]
            for x in soup.find_all("meta", {"name": "citation_author"})
        ]
        # Keep only the surname: the last whitespace-separated token of
        # each citation_author meta value.
        authors = [x.split(" ")[-1].strip() for x in authors]
        title = soup.find_all("meta", {"name": "citation_title"})[0]["content"]
        date = soup.find_all("meta", {"name": "citation_online_date"})[0][
            "content"
        ]
        return dict(title=title, date=date, authors=authors)
Springer, LocalFile, PdfUrl] provider = next((p for p in providers if p.validate(args.input)), None) if provider is None: |
