tangled
alpha
login
or
join now
garrison.tngl.sh
/
anyscraper
0
fork
atom
an async framework for scraping and crawling the web
0
fork
atom
overview
issues
pulls
pipelines
example crawler
garrison.tngl.sh
6 months ago
c09de95a
d8b7eaf3
+121
-16
4 changed files
expand all
collapse all
unified
split
anyscraper
crawler.py
scrape.py
url.py
crawl.py
+2
-1
anyscraper/crawler.py
···
31
31
status: int
32
32
message: str
33
33
34
34
+
34
35
@dataclass
35
36
class FetchSetTempDelay:
36
37
retry_after: float
···
50
51
fetch_log.warn(f"status {response.status} at {url}")
51
52
if response.status == 429:
52
53
# too many requests (HTTP 429): back off for the server-provided retry-after
53
53
-
retry_after = float(response.headers['retry-after'])
54
54
+
retry_after = float(response.headers["retry-after"])
54
55
return FetchSetTempDelay(retry_after=retry_after)
55
56
elif response.status != 200:
56
57
text = await response.text()
+14
-10
anyscraper/scrape.py
···
14
14
log_anchors = Logger("scrape-anchors")
15
15
log_sitemap = Logger("scrape-sitemap")
16
16
17
17
+
17
18
@dataclass
class HtmlScrape:
    """links collected from a single html document"""

    # urls gathered from <link rel=... href=...> tags
    rel_links: List[Url]
    # urls gathered from <a href=...> tags
    anchor_hrefs: List[Url]
21
22
23
23
+
22
24
def is_lang(tag, lang):
    """
    checks whether a tag is acceptable for the requested language

    a tag without a `lang` attribute always passes; otherwise the attribute
    value has to equal `lang` exactly
    """
    declared = tag.get("lang")
    if declared is None:
        return True
    return declared == lang
27
27
+
25
28
26
29
def html(text: str, domain: Domain, lang="en") -> HtmlScrape:
    """
    parses common link types from raw html
    domain is used for relative links

    tags whose `lang` attribute is set to something other than `lang` are
    skipped. returns an HtmlScrape with the targets of <link rel="next"> tags
    in `rel_links` and the targets of <a href> tags in `anchor_hrefs`
    """
    soup = BeautifulSoup(text, "lxml")

    rel_links = []
    for link_tag in soup.select("link[rel][href]"):
        if not is_lang(link_tag, lang):
            continue

        # `rel` is a multi-valued attribute: BeautifulSoup returns it as a list
        # of tokens (e.g. ["next"]), so comparing it to the string "next" is
        # never true. get_attribute_list gives a list in every case.
        rel_tokens = [
            str(token).lower() for token in link_tag.get_attribute_list("rel")
        ]
        if "next" in rel_tokens:
            url = Url.from_quoted_str(cast(str, link_tag.get("href")), domain)
            rel_links.append(url)

    anchor_hrefs = [
        Url.from_quoted_str(cast(str, a_tag.get("href")), domain)
        for a_tag in soup.select("a[href]")
        if is_lang(a_tag, lang)
    ]

    return HtmlScrape(rel_links=rel_links, anchor_hrefs=anchor_hrefs)
51
54
55
55
+
52
56
# TODO support multiple ways of discovering sitemap: https://www.standard-sitemap.org/deployment.php#linking
53
57
54
58
SITEMAP_SCHEMA_HTTP_SITEMAPS_0_9 = Url.from_str(
···
62
66
def _parse_sitemap_0_9(soup: BeautifulSoup, lang: str) -> Iterator[Tuple[str, str]]:
63
67
"""yields ("sitemap" | "url", url_string)"""
64
68
for sitemap in soup.find_all("sitemap"):
65
65
-
yield ("sitemap", sitemap.loc.string) # type: ignore
69
69
+
yield ("sitemap", sitemap.loc.string) # type: ignore
66
70
67
71
for urlset_url in soup.find_all("url"):
68
68
-
yield ("url", urlset_url.loc.string) # type: ignore
72
72
+
yield ("url", urlset_url.loc.string) # type: ignore
69
73
70
74
71
75
# TODO this should probably be redesigned to allow threaded handling of stupid large sitemaps
···
108
112
# check sitemap xmlns
109
113
xmlns = SITEMAP_SCHEMA_HTTP_SITEMAPS_0_9
110
114
111
111
-
xmlns_el = soup.find(attrs={'name':'xmlns'})
115
115
+
xmlns_el = soup.find(attrs={"name": "xmlns"})
112
116
if xmlns_el is not None:
113
117
assert xmlns_el is PageElement
114
118
xmlns = Url.from_str(xmlns_el.get("xmlns"))
+7
-5
anyscraper/url.py
···
62
62
class Domain:
63
63
def __init__(self, url: str):
64
64
res = urlparse(url)
65
65
-
scheme = "http" if res.scheme == '' else res.scheme
65
65
+
scheme = "http" if res.scheme == "" else res.scheme
66
66
self.clean = f"{scheme}://{res.netloc}/"
67
67
68
68
def __eq__(self, other):
···
88
88
route: str
89
89
90
90
@classmethod
91
91
-
def from_str(cls, url_str: str, domain: Optional[Domain]=None):
91
91
+
def from_str(cls, url_str: str, domain: Optional[Domain] = None):
92
92
"""parses url"""
93
93
res = urlparse(url_str)
94
94
95
95
-
if res.netloc != '':
95
95
+
if res.netloc != "":
96
96
domain = Domain(url_str)
97
97
98
98
route = res.path
···
103
103
return Url(domain, route)
104
104
105
105
@classmethod
106
106
-
def from_quoted_str(cls, url_str: str, domain: Optional[Domain]=None):
106
106
+
def from_quoted_str(cls, url_str: str, domain: Optional[Domain] = None):
107
107
"""parses url that may contain %XX quotes"""
108
108
return Url.from_str(unquote_urllike(url_str), domain=domain)
109
109
110
110
def matches(self, other) -> bool:
111
111
"""check if this url matches other, assuming other.route is a rule"""
112
112
assert isinstance(other, Url)
113
113
-
return self.domain == other.domain \
113
113
+
return (
114
114
+
self.domain == other.domain
114
115
and match_rule(other.route, self.route) is not None
116
116
+
)
115
117
116
118
def __eq__(self, other):
117
119
return (
+98
crawl.py
···
1
1
+
#!/usr/bin/env -S uv run
2
2
+
from typing import Optional, List
3
3
+
import asyncio
4
4
+
from anyscraper import Crawler, CrawlResult, Url, scrape
5
5
+
from argparse import ArgumentParser
6
6
+
import itertools
7
7
+
8
8
+
SEEN_URLS = set()
9
9
+
10
10
+
11
11
+
def should_crawl(url: Url, whitelist: Optional[List[Url]]) -> bool:
    """
    decides whether a discovered url should be queued

    urls already in SEEN_URLS are rejected. with no whitelist every other url
    is accepted; with a whitelist the url has to match at least one entry
    """
    if url in SEEN_URLS:
        return False

    return whitelist is None or any(url.matches(rule) for rule in whitelist)
23
23
+
24
24
+
25
25
+
async def crawlback(c: Crawler, res: CrawlResult, whitelist: Optional[List[Url]]):
    """
    handles one crawl result: scrapes links out of html pages and queues the
    ones that pass `should_crawl`

    c: crawler that receives newly discovered urls
    res: the result to process; its `mime`, `text` and `url` are read
    whitelist: forwarded to `should_crawl`; None allows every unseen url

    any mime type other than text/html is only reported on stdout
    """
    # media types are case-insensitive and the parameter separator may be
    # surrounded by whitespace ("Text/HTML ; charset=utf-8")
    mime = res.mime.split(";")[0].strip().lower()

    if mime != "text/html":
        print(f"unhandled mime type: {mime}")
        return

    urls = scrape.html(res.text, res.url.domain)
    for url in itertools.chain(urls.rel_links, urls.anchor_hrefs):
        if not should_crawl(url, whitelist):
            continue

        # mark before queueing so the same url is not queued twice
        SEEN_URLS.add(url)
        await c.add_url(url, res.url)
37
37
+
38
38
+
39
39
+
def parse_args(argv=None):
    """
    parses the command line of the example crawler

    argv: list of argument strings to parse; None (the default) makes
        argparse read sys.argv[1:], which is the previous behavior

    returns the argparse namespace with `whitelist` and `hostile` (each a list
    of strings, or None when the flag was not given) and `seeds` (a list with
    at least one string)
    """
    parser = ArgumentParser(
        description="an example crawler that identifies links for most websites",
    )
    parser.add_argument(
        "-w", "--whitelist", action="append", help="whitelist this domain + route glob"
    )
    parser.add_argument(
        "-H",
        "--hostile",
        action="append",
        help="ignore robots.txt for this domain + route glob",
    )
    parser.add_argument("seeds", nargs="+", help="seed urls to start from")

    return parser.parse_args(argv)
55
55
+
56
56
+
57
57
+
async def run_crawl(args):
    """
    builds a crawler from parsed command line arguments and runs it

    args: namespace with `whitelist`, `hostile` (lists of url strings or None)
        and `seeds` (list of url strings), as produced by `parse_args`

    for every domain named by a `hostile` url the crawler is configured with a
    generated robots.txt that has an empty disallow rule plus one allow rule
    per given route
    """
    whitelist = None
    if args.whitelist is not None:
        whitelist = [Url.from_quoted_str(rule) for rule in args.whitelist]

    c = Crawler(
        "test-crawler",
        crawlback=crawlback,
        crawlback_kwargs={"whitelist": whitelist},
    )

    if args.hostile is not None:
        # group the allowed routes by domain
        allow_rules = {}
        for hostile_url in args.hostile:
            url = Url.from_quoted_str(hostile_url)
            allow_rules.setdefault(url.domain, []).append(url.route)

        for domain, rules in allow_rules.items():
            robots_txt = "\nuser-agent: *\ndisallow:\n" + "\n".join(
                f"allow: {rule}" for rule in rules
            )

            await c.configure_domain(domain, robots_txt)

    for seed in args.seeds:
        seed_url = Url.from_quoted_str(seed)
        # seeds count as seen, otherwise the first page linking back to a seed
        # would queue it a second time
        SEEN_URLS.add(seed_url)
        await c.add_url(seed_url)
    await c.start()
90
90
+
91
91
+
92
92
+
def main():
    """script entry point: parse the command line, then run the crawl to completion"""
    asyncio.run(run_crawl(parse_args()))
95
95
+
96
96
+
97
97
+
if __name__ == "__main__":
98
98
+
main()