An async framework for scraping and crawling the web.

Example crawler.

+121 -16
+2 -1
anyscraper/crawler.py
··· 31 31 status: int 32 32 message: str 33 33 34 + 34 35 @dataclass 35 36 class FetchSetTempDelay: 36 37 retry_after: float ··· 50 51 fetch_log.warn(f"status {response.status} at {url}") 51 52 if response.status == 429: 52 53 # too many retries 53 - retry_after = float(response.headers['retry-after']) 54 + retry_after = float(response.headers["retry-after"]) 54 55 return FetchSetTempDelay(retry_after=retry_after) 55 56 elif response.status != 200: 56 57 text = await response.text()
+14 -10
anyscraper/scrape.py
··· 14 14 log_anchors = Logger("scrape-anchors") 15 15 log_sitemap = Logger("scrape-sitemap") 16 16 17 + 17 18 @dataclass 18 19 class HtmlScrape: 19 20 rel_links: List[Url] 20 21 anchor_hrefs: List[Url] 21 22 23 + 22 24 def is_lang(tag, lang): 23 - actual = tag.get('lang') 25 + actual = tag.get("lang") 24 26 return actual is None or actual == lang 27 + 25 28 26 29 def html(text: str, domain: Domain, lang="en") -> HtmlScrape: 27 30 """ 28 31 parses common link types from raw html 29 32 domain is used for relative links 30 33 """ 31 - soup = BeautifulSoup(text, 'lxml') 34 + soup = BeautifulSoup(text, "lxml") 32 35 33 36 rel_links = [] 34 - for link_tag in soup.select('link[rel][href]'): 37 + for link_tag in soup.select("link[rel][href]"): 35 38 if not is_lang(link_tag, lang): 36 39 continue 37 40 38 - if link_tag.get('rel') == "next": 39 - url = Url.from_quoted_str(cast(str, link_tag.get('href')), domain) 41 + if link_tag.get("rel") == "next": 42 + url = Url.from_quoted_str(cast(str, link_tag.get("href")), domain) 40 43 rel_links.append(url) 41 44 42 45 anchor_hrefs = [] 43 - for a_tag in soup.select('a[href]'): 46 + for a_tag in soup.select("a[href]"): 44 47 if not is_lang(a_tag, lang): 45 48 continue 46 49 47 - url = Url.from_quoted_str(cast(str, a_tag.get('href')), domain) 50 + url = Url.from_quoted_str(cast(str, a_tag.get("href")), domain) 48 51 anchor_hrefs.append(url) 49 52 50 53 return HtmlScrape(rel_links=rel_links, anchor_hrefs=anchor_hrefs) 51 54 55 + 52 56 # TODO support multiple ways of discovering sitemap: https://www.standard-sitemap.org/deployment.php#linking 53 57 54 58 SITEMAP_SCHEMA_HTTP_SITEMAPS_0_9 = Url.from_str( ··· 62 66 def _parse_sitemap_0_9(soup: BeautifulSoup, lang: str) -> Iterator[Tuple[str, str]]: 63 67 """yields ("sitemap" | "url", url_string)""" 64 68 for sitemap in soup.find_all("sitemap"): 65 - yield ("sitemap", sitemap.loc.string) # type: ignore 69 + yield ("sitemap", sitemap.loc.string) # type: ignore 66 70 67 71 for urlset_url in 
soup.find_all("url"): 68 - yield ("url", urlset_url.loc.string) # type: ignore 72 + yield ("url", urlset_url.loc.string) # type: ignore 69 73 70 74 71 75 # TODO this should probably be redesigned to allow threaded handling of stupid large sitemaps ··· 108 112 # check sitemap xmlns 109 113 xmlns = SITEMAP_SCHEMA_HTTP_SITEMAPS_0_9 110 114 111 - xmlns_el = soup.find(attrs={'name':'xmlns'}) 115 + xmlns_el = soup.find(attrs={"name": "xmlns"}) 112 116 if xmlns_el is not None: 113 117 assert xmlns_el is PageElement 114 118 xmlns = Url.from_str(xmlns_el.get("xmlns"))
+7 -5
anyscraper/url.py
··· 62 62 class Domain: 63 63 def __init__(self, url: str): 64 64 res = urlparse(url) 65 - scheme = "http" if res.scheme == '' else res.scheme 65 + scheme = "http" if res.scheme == "" else res.scheme 66 66 self.clean = f"{scheme}://{res.netloc}/" 67 67 68 68 def __eq__(self, other): ··· 88 88 route: str 89 89 90 90 @classmethod 91 - def from_str(cls, url_str: str, domain: Optional[Domain]=None): 91 + def from_str(cls, url_str: str, domain: Optional[Domain] = None): 92 92 """parses url""" 93 93 res = urlparse(url_str) 94 94 95 - if res.netloc != '': 95 + if res.netloc != "": 96 96 domain = Domain(url_str) 97 97 98 98 route = res.path ··· 103 103 return Url(domain, route) 104 104 105 105 @classmethod 106 - def from_quoted_str(cls, url_str: str, domain: Optional[Domain]=None): 106 + def from_quoted_str(cls, url_str: str, domain: Optional[Domain] = None): 107 107 """parses url that may contain %XX quotes""" 108 108 return Url.from_str(unquote_urllike(url_str), domain=domain) 109 109 110 110 def matches(self, other) -> bool: 111 111 """check if this url matches other, assuming other.route is a rule""" 112 112 assert isinstance(other, Url) 113 - return self.domain == other.domain \ 113 + return ( 114 + self.domain == other.domain 114 115 and match_rule(other.route, self.route) is not None 116 + ) 115 117 116 118 def __eq__(self, other): 117 119 return (
+98
crawl.py
··· 1 + #!/usr/bin/env -S uv run 2 + from typing import Optional, List 3 + import asyncio 4 + from anyscraper import Crawler, CrawlResult, Url, scrape 5 + from argparse import ArgumentParser 6 + import itertools 7 + 8 + SEEN_URLS = set() 9 + 10 + 11 + def should_crawl(url: Url, whitelist: Optional[List[Url]]) -> bool: 12 + if url in SEEN_URLS: 13 + return False 14 + 15 + if whitelist is None: 16 + return True 17 + 18 + for w in whitelist: 19 + if url.matches(w): 20 + return True 21 + 22 + return False 23 + 24 + 25 + async def crawlback(c: Crawler, res: CrawlResult, whitelist: Optional[List[Url]]): 26 + match res.mime.split(";")[0]: 27 + case "text/html": 28 + urls = scrape.html(res.text, res.url.domain) 29 + for url in itertools.chain(urls.rel_links, urls.anchor_hrefs): 30 + if not should_crawl(url, whitelist): 31 + continue 32 + 33 + SEEN_URLS.add(url) 34 + await c.add_url(url, res.url) 35 + case unk: 36 + print(f"unhandled mime type: {unk}") 37 + 38 + 39 + def parse_args(): 40 + parser = ArgumentParser( 41 + description="an example crawler that identifies links for most websites", 42 + ) 43 + parser.add_argument( 44 + "-w", "--whitelist", action="append", help="whitelist this domain + route glob" 45 + ) 46 + parser.add_argument( 47 + "-H", 48 + "--hostile", 49 + action="append", 50 + help="ignore robots.txt for this domain + route glob", 51 + ) 52 + parser.add_argument("seeds", nargs="+", help="seed urls to start from") 53 + 54 + return parser.parse_args() 55 + 56 + 57 + async def run_crawl(args): 58 + whitelist = None 59 + if args.whitelist is not None: 60 + whitelist = list(map(Url.from_quoted_str, args.whitelist)) 61 + 62 + c = Crawler( 63 + "test-crawler", 64 + crawlback=crawlback, 65 + crawlback_kwargs={"whitelist": whitelist}, 66 + ) 67 + 68 + if args.hostile is not None: 69 + allow_rules = {} 70 + 71 + for hostile_url in args.hostile: 72 + url = Url.from_quoted_str(hostile_url) 73 + if url.domain not in allow_rules: 74 + allow_rules[url.domain] = [] 75 + 
allow_rules[url.domain].append(url.route) 76 + 77 + for domain, rules in allow_rules.items(): 78 + robots_txt = """ 79 + user-agent: * 80 + disallow: 81 + """ + "\n".join( 82 + f"allow: {rule}" for rule in rules 83 + ) 84 + 85 + await c.configure_domain(domain, robots_txt) 86 + 87 + for seed in args.seeds: 88 + await c.add_url(Url.from_quoted_str(seed)) 89 + await c.start() 90 + 91 + 92 + def main(): 93 + args = parse_args() 94 + asyncio.run(run_crawl(args)) 95 + 96 + 97 + if __name__ == "__main__": 98 + main()