diff options
| author | Gertjan van den Burg <gertjanvandenburg@gmail.com> | 2020-01-07 14:25:50 +0000 |
|---|---|---|
| committer | Gertjan van den Burg <gertjanvandenburg@gmail.com> | 2020-01-07 14:25:50 +0000 |
| commit | f28078537d1fd48b969270b8300750bc07895cec (patch) | |
| tree | fbb8974bee99e13dec333856ca966f6eae557985 | |
| parent | Add citeseerx provider to readme (diff) | |
| download | paper2remarkable-f28078537d1fd48b969270b8300750bc07895cec.tar.gz paper2remarkable-f28078537d1fd48b969270b8300750bc07895cec.zip | |
Update ACM provider to new site structure
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | paper2remarkable/providers/acm.py | 69 |
| -rw-r--r-- | tests/test_providers.py | 11 |
2 files changed, 45 insertions, 35 deletions
diff --git a/paper2remarkable/providers/acm.py b/paper2remarkable/providers/acm.py index a0d79bd..08e8cb2 100644 --- a/paper2remarkable/providers/acm.py +++ b/paper2remarkable/providers/acm.py @@ -8,13 +8,12 @@ Copyright: 2019, G.J.J. van den Burg """ -import bs4 import re from ._base import Provider from ._info import Informer from .. import GITHUB_URL -from ..utils import exception, get_page_with_retry +from ..utils import exception from ..log import Logger logger = Logger() @@ -23,58 +22,60 @@ logger = Logger() class ACMInformer(Informer): meta_author_key = "citation_authors" + def get_title(self, soup): + target = soup.find("h1", {"class": "citation__title"}) + return target.text + + def get_authors(self, soup): + authors = [ + span.find("a").text + for span in soup.find_all("span", {"class": "auth-name"}) + ] + return self._format_authors(authors) + def _format_authors(self, soup_authors): - op = lambda x: x[0].split(";") - return super()._format_authors(soup_authors, sep=",", idx=0, op=op) + return super()._format_authors(soup_authors, sep=" ", idx=-1) + + def get_year(self, soup): + date = soup.find("span", {"class": "epub-section__date"}) + return self._format_year(date.text) def _format_year(self, soup_date): - if not re.match("\d{2}/\d{2}/\d{4}", soup_date.strip()): - logger.warning( - "Couldn't extract year from ACM page, please raise an " - "issue on GitHub so it can be fixed: %s" % GITHUB_URL - ) - return soup_date.strip().split("/")[-1] + return soup_date.strip().split(" ")[-1].strip() class ACM(Provider): - re_abs = "https?://dl.acm.org/citation.cfm\?id=\d+" + re_abs = "^https?://dl.acm.org/doi/(?P<doi>\d+\.\d+/\d+\.\d+)" + re_pdf = "^https?://dl.acm.org/doi/pdf/(?P<doi>\d+\.\d+/\d+\.\d+)\?download=true" def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self.informer = ACMInformer() - def get_acm_pdf_url(self, url): - page = get_page_with_retry(url) - soup = bs4.BeautifulSoup(page, "html.parser") - thea = None - for a in 
soup.find_all("a"): - if a.get("name") == "FullTextPDF": - thea = a - break - if thea is None: - return None - href = thea.get("href") - if href.startswith("http"): - return href - else: - return "https://dl.acm.org/" + href + def _get_doi(self, url): + m = re.match(self.re_abs, url) or re.match(self.re_pdf, url) + if m: + return m["doi"] + exception("Couldn't retrieve ACM publication DOI.") def get_abs_pdf_urls(self, url): if re.match(self.re_abs, url): abs_url = url - pdf_url = self.get_acm_pdf_url(url) - if pdf_url is None: - exception( - "Couldn't extract PDF url from ACM citation page. Maybe it's behind a paywall?" - ) + doi = self._get_doi(url) + pdf_url = "https://dl.acm.org/doi/pdf/{doi}?download=true".format( + doi=doi + ) + elif re.match(self.re_pdf, url): + pdf_url = url + doi = self._get_doi(url) + abs_url = "https://dl.acm.org/doi/{doi}".format(doi=doi) else: exception( - "Couldn't figure out ACM urls, please provide a URL of the " - "format: http(s)://dl.acm.org/citation.cfm?id=..." 
+ "Couldn't figure out ACM urls from provided url: %s" % url ) return abs_url, pdf_url def validate(src): - m = re.fullmatch(ACM.re_abs, src) + m = re.match(ACM.re_abs, src) or re.match(ACM.re_pdf, src) return not m is None diff --git a/tests/test_providers.py b/tests/test_providers.py index 75703ff..422fe0f 100644 --- a/tests/test_providers.py +++ b/tests/test_providers.py @@ -88,13 +88,22 @@ class TestProviders(unittest.TestCase): filename = prov.run(url) self.assertEqual(exp_filename, os.path.basename(filename)) - def test_acm(self): + def test_acm_1(self): prov = ACM(upload=False, verbose=VERBOSE) url = "https://dl.acm.org/citation.cfm?id=3025626" exp_filename = "Kery_Horvath_Myers_-_Variolite_Supporting_Exploratory_Programming_by_Data_Scientists_2017.pdf" filename = prov.run(url) self.assertEqual(exp_filename, os.path.basename(filename)) + def test_acm_2(self): + prov = ACM(upload=False, verbose=VERBOSE) + url = ( + "https://dl.acm.org/doi/pdf/10.1145/3219819.3220081?download=true" + ) + exp_filename = "Bateni_Esfandiari_Mirrokni_-_Optimal_Distributed_Submodular_Optimization_via_Sketching_2018.pdf" + filename = prov.run(url) + self.assertEqual(exp_filename, os.path.basename(filename)) + def test_openreview(self): prov = OpenReview(upload=False, verbose=VERBOSE) url = "https://openreview.net/forum?id=S1x4ghC9tQ" |
