about | summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- paper2remarkable/providers/acm.py 69
-rw-r--r-- tests/test_providers.py 11
2 files changed, 45 insertions, 35 deletions
diff --git a/paper2remarkable/providers/acm.py b/paper2remarkable/providers/acm.py
index a0d79bd..08e8cb2 100644
--- a/paper2remarkable/providers/acm.py
+++ b/paper2remarkable/providers/acm.py
@@ -8,13 +8,12 @@ Copyright: 2019, G.J.J. van den Burg
"""
-import bs4
import re
from ._base import Provider
from ._info import Informer
from .. import GITHUB_URL
-from ..utils import exception, get_page_with_retry
+from ..utils import exception
from ..log import Logger
logger = Logger()
@@ -23,58 +22,60 @@ logger = Logger()
class ACMInformer(Informer):
meta_author_key = "citation_authors"
+ def get_title(self, soup):
+ target = soup.find("h1", {"class": "citation__title"})
+ return target.text
+
+ def get_authors(self, soup):
+ authors = [
+ span.find("a").text
+ for span in soup.find_all("span", {"class": "auth-name"})
+ ]
+ return self._format_authors(authors)
+
def _format_authors(self, soup_authors):
- op = lambda x: x[0].split(";")
- return super()._format_authors(soup_authors, sep=",", idx=0, op=op)
+ return super()._format_authors(soup_authors, sep=" ", idx=-1)
+
+ def get_year(self, soup):
+ date = soup.find("span", {"class": "epub-section__date"})
+ return self._format_year(date.text)
def _format_year(self, soup_date):
- if not re.match("\d{2}/\d{2}/\d{4}", soup_date.strip()):
- logger.warning(
- "Couldn't extract year from ACM page, please raise an "
- "issue on GitHub so it can be fixed: %s" % GITHUB_URL
- )
- return soup_date.strip().split("/")[-1]
+ return soup_date.strip().split(" ")[-1].strip()
class ACM(Provider):
- re_abs = "https?://dl.acm.org/citation.cfm\?id=\d+"
+ re_abs = "^https?://dl.acm.org/doi/(?P<doi>\d+\.\d+/\d+\.\d+)"
+ re_pdf = "^https?://dl.acm.org/doi/pdf/(?P<doi>\d+\.\d+/\d+\.\d+)\?download=true"
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.informer = ACMInformer()
- def get_acm_pdf_url(self, url):
- page = get_page_with_retry(url)
- soup = bs4.BeautifulSoup(page, "html.parser")
- thea = None
- for a in soup.find_all("a"):
- if a.get("name") == "FullTextPDF":
- thea = a
- break
- if thea is None:
- return None
- href = thea.get("href")
- if href.startswith("http"):
- return href
- else:
- return "https://dl.acm.org/" + href
+ def _get_doi(self, url):
+ m = re.match(self.re_abs, url) or re.match(self.re_pdf, url)
+ if m:
+ return m["doi"]
+ exception("Couldn't retrieve ACM publication DOI.")
def get_abs_pdf_urls(self, url):
if re.match(self.re_abs, url):
abs_url = url
- pdf_url = self.get_acm_pdf_url(url)
- if pdf_url is None:
- exception(
- "Couldn't extract PDF url from ACM citation page. Maybe it's behind a paywall?"
- )
+ doi = self._get_doi(url)
+ pdf_url = "https://dl.acm.org/doi/pdf/{doi}?download=true".format(
+ doi=doi
+ )
+ elif re.match(self.re_pdf, url):
+ pdf_url = url
+ doi = self._get_doi(url)
+ abs_url = "https://dl.acm.org/doi/{doi}".format(doi=doi)
else:
exception(
- "Couldn't figure out ACM urls, please provide a URL of the "
- "format: http(s)://dl.acm.org/citation.cfm?id=..."
+ "Couldn't figure out ACM urls from provided url: %s" % url
)
return abs_url, pdf_url
def validate(src):
- m = re.fullmatch(ACM.re_abs, src)
+ m = re.match(ACM.re_abs, src) or re.match(ACM.re_pdf, src)
return not m is None
diff --git a/tests/test_providers.py b/tests/test_providers.py
index 75703ff..422fe0f 100644
--- a/tests/test_providers.py
+++ b/tests/test_providers.py
@@ -88,13 +88,22 @@ class TestProviders(unittest.TestCase):
filename = prov.run(url)
self.assertEqual(exp_filename, os.path.basename(filename))
- def test_acm(self):
+ def test_acm_1(self):
prov = ACM(upload=False, verbose=VERBOSE)
url = "https://dl.acm.org/citation.cfm?id=3025626"
exp_filename = "Kery_Horvath_Myers_-_Variolite_Supporting_Exploratory_Programming_by_Data_Scientists_2017.pdf"
filename = prov.run(url)
self.assertEqual(exp_filename, os.path.basename(filename))
+ def test_acm_2(self):
+ prov = ACM(upload=False, verbose=VERBOSE)
+ url = (
+ "https://dl.acm.org/doi/pdf/10.1145/3219819.3220081?download=true"
+ )
+ exp_filename = "Bateni_Esfandiari_Mirrokni_-_Optimal_Distributed_Submodular_Optimization_via_Sketching_2018.pdf"
+ filename = prov.run(url)
+ self.assertEqual(exp_filename, os.path.basename(filename))
+
def test_openreview(self):
prov = OpenReview(upload=False, verbose=VERBOSE)
url = "https://openreview.net/forum?id=S1x4ghC9tQ"