# paper2remarkable/providers/arxiv.py

# -*- coding: utf-8 -*-

"""Provider for arxiv.org

Author: G.J.J. van den Burg
License: See LICENSE file
Copyright: 2019, G.J.J. van den Burg

"""

import os
import re

from ._info import Informer
from ._base import Provider
from ..exceptions import URLResolutionError
from ..log import Logger

logger = Logger()

# Regex (bytes) for the text of the arXiv stamp inside an uncompressed pdf,
# e.g. b"arXiv:1901.00001v1 [cs.LG] 1 Jan 2019".
# Written as a raw bytes literal so that the regex escapes (\d, \s, \w, ...)
# are passed to `re` verbatim instead of being treated as (invalid) Python
# escape sequences, which emit a warning on recent Python versions.
DEARXIV_TEXT_REGEX = (
    rb"arXiv:\d{4}\.\d{4,5}v\d+\s+\[[\w\-]+\.\w+\]\s+\d{1,2}\s\w{3}\s\d{4}"
)


class ArxivInformer(Informer):
    """Informer for arXiv pages; adds nothing on top of the base Informer."""


class Arxiv(Provider):
    """Provider for papers hosted on arxiv.org.

    Recognises abstract ("abs") and pdf urls for both new-style identifiers
    (e.g. ``1901.00001v2``) and old-style identifiers (e.g.
    ``hep-th/9901001``), and registers an extra ``dearxiv`` operation that
    removes the arXiv stamp from the pdf.
    """

    # New-style identifiers: four digits, a dot, four or five digits and an
    # optional version suffix.
    re_abs_1 = r"https?://arxiv\.org/abs/\d{4}\.\d{4,5}(v\d+)?"
    re_pdf_1 = r"https?://arxiv\.org/pdf/\d{4}\.\d{4,5}(v\d+)?\.pdf"

    # Old-style identifiers: archive name (which may contain a hyphen and a
    # subject class, e.g. "hep-th" or "math.GT"), a slash and seven digits.
    re_abs_2 = r"https?://arxiv\.org/abs/[\w.\-]+/\d{7}(v\d+)?"
    re_pdf_2 = r"https?://arxiv\.org/pdf/[\w.\-]+/\d{7}(v\d+)?\.pdf"

    def __init__(self, *args, **kwargs):
        """Set up the provider, its informer and the dearxiv operation."""
        super().__init__(*args, **kwargs)
        self.informer = ArxivInformer()

        # register the dearxiv operation as the first operation to run
        self.operations.insert(0, ("dearxiv", self.dearxiv))

    def get_abs_pdf_urls(self, url):
        """Get the abs and pdf url from any given arXiv url.

        Only the part of ``url`` that matches one of the arXiv patterns is
        used, so a trailing query string or fragment does not end up in the
        derived urls.

        Parameters
        ----------
        url : str
            Url of an arXiv abstract page or pdf file.

        Returns
        -------
        tuple of (str, str)
            The url of the abstract page and the url of the pdf file.

        Raises
        ------
        URLResolutionError
            If ``url`` matches none of the known arXiv url patterns.
        """
        abs_match = re.match(self.re_abs_1, url) or re.match(
            self.re_abs_2, url
        )
        if abs_match:
            abs_url = abs_match.group(0)
            # Replace only the path component, never text in the identifier
            pdf_url = abs_url.replace("/abs/", "/pdf/", 1) + ".pdf"
            return abs_url, pdf_url

        pdf_match = re.match(self.re_pdf_1, url) or re.match(
            self.re_pdf_2, url
        )
        if pdf_match:
            pdf_url = pdf_match.group(0)
            # Drop the ".pdf" extension before switching the path component
            abs_url = pdf_url[: -len(".pdf")].replace("/pdf/", "/abs/", 1)
            return abs_url, pdf_url

        raise URLResolutionError("arXiv", url)

    @staticmethod
    def validate(src):
        """Check if the url is to an arXiv page.

        Returns the (truthy) match object of the first pattern that matches
        the start of ``src``, or ``None`` when no pattern matches.
        """
        return (
            re.match(Arxiv.re_abs_1, src)
            or re.match(Arxiv.re_pdf_1, src)
            or re.match(Arxiv.re_abs_2, src)
            or re.match(Arxiv.re_pdf_2, src)
        )

    def dearxiv(self, input_file):
        """Remove the arXiv timestamp from a pdf.

        The pdf is uncompressed, the stamp text and its link are removed
        from the raw bytes, and the result is compressed again.
        Intermediate files are written next to ``input_file`` with the
        suffixes ``_uncompress.pdf`` and ``_removed.pdf``.

        Parameters
        ----------
        input_file : str
            Path to the pdf file to process.

        Returns
        -------
        str
            Path to the processed pdf (``<basename>_dearxiv.pdf``).
        """
        logger.info("Removing arXiv timestamp")
        basename = os.path.splitext(input_file)[0]

        uncompress_file = basename + "_uncompress.pdf"
        self.uncompress_pdf(input_file, uncompress_file)

        with open(uncompress_file, "rb") as fid:
            data = fid.read()

        # Remove the text element: keep the text-show operator but make the
        # shown string empty.
        data = re.sub(rb"\(" + DEARXIV_TEXT_REGEX + rb"\)Tj", b"()Tj", data)
        # Remove the URL element (the link action pointing at the abs page)
        data = re.sub(
            rb"<<\n\/URI \(http://arxiv\.org/abs/\d{4}\.\d{4,5}v\d+\)\n\/S /URI\n>>\n",
            b"",
            data,
        )

        removed_file = basename + "_removed.pdf"
        with open(removed_file, "wb") as oid:
            oid.write(data)

        output_file = basename + "_dearxiv.pdf"
        self.compress_pdf(removed_file, output_file)

        return output_file