Social media crossposting tool — third time's the charm.
Crossposting between Mastodon, Misskey, and Bluesky.

More work: convert the parsers to use UTF-8 byte offsets.

zenfyr.dev 749c26db 7e647c4b

verified
+224 -104
+87
bluesky/facets.py
from typing import Any, override

import cross.fragments as f
from util.splitter import FragmentSplitter, canonical_label

# Bluesky richtext facet feature type discriminators ($type values).
LINK = 'app.bsky.richtext.facet#link'
TAG = 'app.bsky.richtext.facet#tag'
MENTION = "app.bsky.richtext.facet#mention"


class BskySplitter(FragmentSplitter):
    """Fragment splitter configured for Bluesky's limits.

    300 is Bluesky's post character limit; 30 is the target display
    length for shortened link labels.
    """

    def __init__(self):
        super().__init__(300, 30)

    @override
    def normalize_link(self, label: str, url: str) -> str:
        """Shorten a link label when it is just a rendering of its URL.

        Only labels that `canonical_label` recognizes as equivalent to the
        URL are rewritten; hand-written labels are left untouched.
        """
        if canonical_label(label, url):
            # Drop the scheme, then truncate with an ellipsis if needed.
            nlabel = url.split("://", 1)[1]
            if len(nlabel) <= self.urllen:
                return nlabel
            return nlabel[: self.urllen - 1] + "…"
        return label


# TODO handle extending overlapping fragments somehow
def parse_facets(
    text: str,
    facets: list[dict[str, Any]] | None
) -> tuple[str, list[f.Fragment]]:
    """Convert Bluesky richtext facets into cross-service fragments.

    Rebuilds the post text (expanding truncated link labels such as
    "example.com/pa..." back to the full URL) and returns it together
    with fragments whose start/end are UTF-8 byte offsets into the
    rebuilt text.

    Args:
        text: The post's plain text.
        facets: The record's `facets` list (byteStart/byteEnd are UTF-8
            byte offsets into ``text``), or None.

    Returns:
        (new_text, fragments) — the possibly rewritten text and the
        link/tag/mention fragments found, positioned in the new text.
    """
    if not facets:
        return text, []

    btext = text.encode("utf-8")
    nbytes = bytearray()
    last_original_byte_index = 0
    fragments: list[f.Fragment] = []

    # The AT Protocol does not guarantee facet ordering; process facets in
    # byteStart order so the gap-copying below stays monotonic. Without
    # this, an out-of-order facet would skip the text between facets and
    # corrupt both the rebuilt text and every later fragment offset.
    for facet in sorted(facets, key=lambda fa: fa['index']['byteStart']):
        original_start: int = facet['index']['byteStart']
        original_end: int = facet['index']['byteEnd']

        # Copy the plain text between the previous facet and this one.
        if last_original_byte_index < original_start:
            nbytes.extend(btext[last_original_byte_index:original_start])

        # Index this facet's features by their $type discriminator.
        fdict = {feat['$type']: feat for feat in facet.get('features', [])}

        original_label_bytes = btext[original_start:original_end]
        original_label_str = original_label_bytes.decode("utf-8")

        nlabel_bytes = original_label_bytes

        if LINK in fdict:
            url: str = fdict.pop(LINK)['uri']
            label = original_label_str

            # Detect whether the visible label is the URL itself (possibly
            # truncated with "..." by the client); if so, substitute the
            # full URL so downstream services see the real target.
            split = url.split("://", 1)
            full_url = False
            if len(split) > 1:
                if split[1].startswith(label):
                    full_url = True
                if label.endswith("...") and split[1].startswith(label[:-3]):
                    full_url = True

            if full_url:
                nlabel_bytes = url.encode("utf-8")

            nstart = len(nbytes)
            nbytes.extend(nlabel_bytes)
            nend = len(nbytes)

            fragments.append(f.LinkFragment(start=nstart, end=nend, url=url))
        else:
            # Non-link facet: keep the label as-is, record its new span.
            nstart = len(nbytes)
            nbytes.extend(nlabel_bytes)
            nend = len(nbytes)

        if TAG in fdict:
            tag: str = fdict.pop(TAG)['tag']
            fragments.append(f.TagFragment(start=nstart, end=nend, tag=tag))

        if MENTION in fdict:
            did: str = fdict.pop(MENTION)['did']
            fragments.append(f.MentionFragment(start=nstart, end=nend, uri=did))

        last_original_byte_index = original_end

    # Copy any trailing text after the last facet.
    if last_original_byte_index < len(btext):
        nbytes.extend(btext[last_original_byte_index:])

    return nbytes.decode("utf-8"), fragments
+5 -2
bluesky/input.py
··· 8 8 import websockets 9 9 10 10 from atproto.util import AtUri 11 + from bluesky.facets import parse_facets 11 12 from bluesky.info import SERVICE, BlueskyService, validate_and_transform 12 13 from cross.attachments import ( 13 14 LabelsAttachment, ··· 75 76 ) 76 77 return 77 78 78 - # TODO FRAGMENTS 79 - post = Post(id=post_uri, parent_id=parent_uri, text=record["text"]) 79 + text, fragments = parse_facets(record["text"], record.get('facets')) 80 + post = Post(id=post_uri, parent_id=parent_uri, text=text) 81 + post.fragments.extend(fragments) 82 + 80 83 did, _, rid = AtUri.record_uri(post_uri) 81 84 post.attachments.put( 82 85 RemoteUrlAttachment(url=f"https://bsky.app/profile/{did}/post/{rid}")
+25 -27
util/html.py
··· 2 2 from typing import override 3 3 import cross.fragments as f 4 4 5 - 6 5 class HTMLToFragmentsParser(HTMLParser): 7 6 def __init__(self) -> None: 8 7 super().__init__() 9 - self.text: str = "" 8 + self.builder: bytearray = bytearray() 10 9 self.fragments: list[f.Fragment] = [] 11 10 12 11 self._tag_stack: dict[str, tuple[int, dict[str, str | None]]] = {} 13 12 self.in_pre: bool = False 14 13 self.in_code: bool = False 15 - 16 14 self.invisible: bool = False 17 15 18 16 def handle_a_endtag(self): 19 - current_end = len(self.text) 17 + current_end = len(self.builder) 20 18 start, _attr = self._tag_stack.pop("a") 21 19 22 20 href = _attr.get('href') ··· 30 28 _attr = dict(attrs) 31 29 32 30 def append_newline(): 33 - if self.text and not self.text.endswith("\n"): 34 - self.text += "\n" 31 + if self.builder and not self.builder.endswith(b"\n"): 32 + self.builder.extend(b"\n") 35 33 36 34 if self.invisible: 37 35 return ··· 42 40 if cls and 'quote-inline' in cls: 43 41 self.invisible = True 44 42 case "a": 45 - self._tag_stack["a"] = (len(self.text), _attr) 43 + self._tag_stack["a"] = (len(self.builder), _attr) 46 44 case "code": 47 45 if not self.in_pre: 48 - self.text += "`" 46 + self.builder.extend(b"`") 49 47 self.in_code = True 50 48 case "pre": 51 49 append_newline() 52 - self.text += "```\n" 50 + self.builder.extend(b"```\n") 53 51 self.in_pre = True 54 52 case "blockquote": 55 53 append_newline() 56 - self.text += "> " 54 + self.builder.extend(b"> ") 57 55 case "strong" | "b": 58 - self.text += "**" 56 + self.builder.extend(b"**") 59 57 case "em" | "i": 60 - self.text += "*" 58 + self.builder.extend(b"*") 61 59 case "del" | "s": 62 - self.text += "~~" 60 + self.builder.extend(b"~~") 63 61 case "br": 64 - self.text += "\n" 62 + self.builder.extend(b"\n") 65 63 case _: 66 64 if tag in {"h1", "h2", "h3", "h4", "h5", "h6"}: 67 65 level = int(tag[1]) 68 - self.text += "\n" + "#" * level + " " 66 + self.builder.extend(("\n" + "#" * level + " ").encode('utf-8')) 
69 67 70 68 @override 71 69 def handle_endtag(self, tag: str) -> None: ··· 80 78 self.handle_a_endtag() 81 79 case "code": 82 80 if not self.in_pre and self.in_code: 83 - self.text += "`" 81 + self.builder.extend(b"`") 84 82 self.in_code = False 85 83 case "pre": 86 - self.text += "\n```\n" 84 + self.builder.extend(b"\n```\n") 87 85 self.in_pre = False 88 86 case "blockquote": 89 - self.text += "\n" 87 + self.builder.extend(b"\n") 90 88 case "strong" | "b": 91 - self.text += "**" 89 + self.builder.extend(b"**") 92 90 case "em" | "i": 93 - self.text += "*" 91 + self.builder.extend(b"*") 94 92 case "del" | "s": 95 - self.text += "~~" 93 + self.builder.extend(b"~~") 96 94 case "p": 97 - self.text += "\n\n" 95 + self.builder.extend(b"\n\n") 98 96 case _: 99 97 if tag in ["h1", "h2", "h3", "h4", "h5", "h6"]: 100 - self.text += '\n' 98 + self.builder.extend(b'\n') 101 99 102 100 @override 103 101 def handle_data(self, data: str) -> None: 104 102 if not self.invisible: 105 - self.text += data 103 + self.builder.extend(data.encode('utf-8')) 106 104 107 105 def get_result(self) -> tuple[str, list[f.Fragment]]: 108 - if self.text.endswith('\n\n'): 109 - return self.text[:-2], self.fragments 110 - return self.text, self.fragments 106 + if self.builder.endswith(b'\n\n'): 107 + return self.builder[:-2].decode('utf-8'), self.fragments 108 + return self.builder.decode('utf-8'), self.fragments
+71 -43
util/markdown.py
··· 3 3 import cross.fragments as f 4 4 from util.html import HTMLToFragmentsParser 5 5 6 - URL = re.compile(r"(?:(?:[A-Za-z][A-Za-z0-9+.-]*://)|mailto:)[^\s]+", re.IGNORECASE) 6 + URL = re.compile(rb"(?:(?:[A-Za-z][A-Za-z0-9+.-]*://)|mailto:)[^\s]+", re.IGNORECASE) 7 7 MD_INLINE_LINK = re.compile( 8 - r"\[([^\]]+)\]\(\s*((?:(?:[A-Za-z][A-Za-z0-9+.\-]*://)|mailto:)[^\s\)]+)\s*\)", 8 + rb"\[([^\]]+)\]\(\s*((?:(?:[A-Za-z][A-Za-z0-9+.\-]*://)|mailto:)[^\s\)]+)\s*\)", 9 9 re.IGNORECASE, 10 10 ) 11 11 MD_AUTOLINK = re.compile( 12 - r"<((?:(?:[A-Za-z][A-Za-z0-9+.\-]*://)|mailto:)[^\s>]+)>", re.IGNORECASE 12 + rb"<((?:(?:[A-Za-z][A-Za-z0-9+.\-]*://)|mailto:)[^\s>]+)>", re.IGNORECASE 13 13 ) 14 - HASHTAG = re.compile(r"(?<!\w)\#([\w]+)") 15 - FEDIVERSE_HANDLE = re.compile(r"(?<![\w@])@([\w\.-]+)(?:@([\w\.-]+\.[\w\.-]+))?") 14 + HASHTAG = re.compile(rb"(?<!\w)\#([\w]+)") 15 + FEDIVERSE_HANDLE = re.compile(rb"(?<![\w@])@([\w\.-]+)(?:@([\w\.-]+\.[\w\.-]+))?") 16 16 17 17 REGEXES = [URL, MD_INLINE_LINK, MD_AUTOLINK, HASHTAG, FEDIVERSE_HANDLE] 18 18 ··· 27 27 html_parser.feed(text) 28 28 markdown, fragments = html_parser.get_result() 29 29 30 + markdown_bytes: bytes = markdown.encode("utf-8") 31 + 30 32 index: int = 0 31 - total: int = len(markdown) 33 + total: int = len(markdown_bytes) 32 34 33 - # no match == processed fragments 34 - events: list[tuple[int, int, re.Match[str] | f.Fragment, str]] = [] 35 + events: list[tuple[int, int, re.Match[bytes] | f.Fragment, str]] = [] 35 36 events.extend([(fg.start, fg.end, fg, "html") for fg in fragments]) 37 + 36 38 while index < total: 37 - ch = markdown[index] 38 - rmatch = None 39 + ch: int = markdown_bytes[index] 40 + rmatch: re.Match[bytes] | None = None 39 41 kind = None 40 42 41 - if ch == "[": 42 - rmatch = MD_INLINE_LINK.match(markdown, index) 43 + if ch == b"["[0]: 44 + rmatch = MD_INLINE_LINK.match(markdown_bytes, index) 43 45 kind = "inline_link" 44 - # elif ch == '<': 45 - # rmatch = MD_AUTOLINK.match(markdown, index) 46 
- # kind = "autolink" 47 - elif ch == "#": 48 - rmatch = HASHTAG.match(markdown, index) 46 + # elif ch == b"<"[0]: 47 + # rmatch = MD_AUTOLINK.match(markdown_bytes, index) 48 + # kind = "autolink" 49 + elif ch == b"#"[0]: 50 + rmatch = HASHTAG.match(markdown_bytes, index) 49 51 kind = "hashtag" 50 - elif ch == "@": 51 - rmatch = FEDIVERSE_HANDLE.match(markdown, index) 52 + elif ch == b"@"[0]: 53 + rmatch = FEDIVERSE_HANDLE.match(markdown_bytes, index) 52 54 kind = "mention" 53 55 else: 54 - rmatch = URL.match(markdown, index) 56 + rmatch = URL.match(markdown_bytes, index) 55 57 kind = "url" 56 58 57 59 if rmatch: ··· 67 69 68 70 events.sort(key=lambda x: x[0]) 69 71 70 - # validate fragment positions 71 72 last_end: int = 0 72 73 for start, end, _, _ in events: 73 74 if start > end: ··· 78 79 ) 79 80 last_end = end 80 81 81 - ntext: list[str] = [] 82 + ntext: bytearray = bytearray() 82 83 nfragments: list[f.Fragment] = [] 83 84 84 85 offset: int = 0 85 86 last_index: int = 0 86 87 87 - events.sort(key=lambda x: x[0]) 88 88 for start, end, rmatch, event in events: 89 - ntext.append(markdown[last_index:start]) 89 + ntext.extend(markdown_bytes[last_index:start]) 90 90 91 91 if isinstance(rmatch, f.Fragment): 92 - ntext.append(markdown[start:end]) 92 + ntext.extend(markdown_bytes[start:end]) 93 93 nfg = replace(rmatch, start=start + offset, end=end + offset) 94 94 nfragments.append(nfg) 95 95 last_index = end 96 96 continue 97 97 98 98 nstart = start + offset 99 - nend = end + offset 100 99 match event: 101 100 case "inline_link": 102 - label = rmatch.group(1) 103 - href = rmatch.group(2) 104 - ntext.append(label) 101 + label_bytes: bytes = rmatch.group(1) 102 + href_bytes: bytes = rmatch.group(2) 105 103 106 - delta = len(label) - (end - start) 104 + ntext.extend(label_bytes) 105 + 106 + delta = len(label_bytes) - (end - start) 107 107 offset += delta 108 108 109 - nfragments.append(f.LinkFragment(start=nstart, end=nstart + len(label), url=href)) 109 + nend = nstart + 
len(label_bytes) 110 + nfragments.append( 111 + f.LinkFragment( 112 + start=nstart, end=nend, url=href_bytes.decode("utf-8") 113 + ) 114 + ) 115 + 110 116 case "hashtag": 111 - tag = rmatch.group(1) 112 - ntext.append(markdown[start:end]) 113 - nfragments.append(f.TagFragment(start=nstart, end=nend, tag=tag)) 117 + tag_bytes: bytes = rmatch.group(1) 118 + ntext.extend(markdown_bytes[start:end]) 119 + nend = end + offset 120 + nfragments.append( 121 + f.TagFragment( 122 + start=nstart, end=nend, tag=tag_bytes.decode("utf-8") 123 + ) 124 + ) 125 + 114 126 case "mention": 115 - mention = rmatch.group(0) 116 - ntext.append(markdown[start:end]) 117 - mention = mention[1:] if mention.startswith("@") else mention 118 - nfragments.append(f.MentionFragment(start=nstart, end=nend, uri=mention)) 127 + mention_bytes: bytes = rmatch.group(0) 128 + ntext.extend(markdown_bytes[start:end]) 129 + 130 + mention_str = mention_bytes.decode("utf-8") 131 + mention_str = ( 132 + mention_str[1:] if mention_str.startswith("@") else mention_str 133 + ) 134 + 135 + nend = end + offset 136 + nfragments.append( 137 + f.MentionFragment(start=nstart, end=nend, uri=mention_str) 138 + ) 139 + 119 140 case "url": 120 - url = rmatch.group(0) 121 - ntext.append(markdown[start:end]) 122 - nfragments.append(f.LinkFragment(start=nstart, end=nend, url=url)) 141 + url_bytes: bytes = rmatch.group(0) 142 + ntext.extend(markdown_bytes[start:end]) 143 + nend = end + offset 144 + nfragments.append( 145 + f.LinkFragment( 146 + start=nstart, end=nend, url=url_bytes.decode("utf-8") 147 + ) 148 + ) 149 + 123 150 case _: 124 151 pass 125 152 last_index = end 126 - ntext.append(markdown[last_index:]) 153 + 154 + ntext.extend(markdown_bytes[last_index:]) 127 155 128 - return ''.join(ntext), nfragments 156 + return ntext.decode("utf-8"), nfragments
+36 -32
util/splitter.py
··· 21 21 self.urllen: int = urllen 22 22 23 23 def normalize_link(self, label: str, url: str) -> str: 24 - #if canonical_label(label, url): 25 - # if self.urltrunc == "dotted": 26 - # nlabel = url.split("://", 1)[1] 27 - # if len(nlabel) <= self.urllen: 28 - # return nlabel 29 - # return nlabel[: self.urllen - 1] + "…" 30 24 return label 25 + 26 + def tally_lenght(self, post: tuple[str, list[Fragment]]): 27 + return grapheme.length(post[0]) 31 28 32 29 def url_normalize( 33 - self, text: str, fragments: list[Fragment] 34 - ) -> tuple[str, list[Fragment]]: 35 - if self.urllen == -1: 36 - return text, fragments 30 + self, text: str, fragments: list[Fragment] 31 + ) -> tuple[str, list[Fragment]]: 32 + if self.urllen == -1: 33 + return text, fragments 34 + btext = text.encode('utf-8') 37 35 38 - ntext: list[str] = [] 39 - nfragments: list[Fragment] = [] 36 + nbytes = bytearray() 37 + nfragments: list[Fragment] = [] 40 38 41 - offset: int = 0 42 - last_index: int = 0 39 + fragments = [fg for fg in fragments] 40 + fragments.sort(key=lambda x: x.start) 43 41 44 - fragments = [fg for fg in fragments] 45 - fragments.sort(key=lambda x: x.start) 42 + last_index = 0 46 43 47 - for fg in fragments: 48 - ntext.append(text[last_index:fg.start]) 49 - label = text[fg.start:fg.end] 50 - nlabel = label 51 - if isinstance(fg, LinkFragment): 52 - nlabel = self.normalize_link(nlabel, fg.url) 53 - ntext.append(nlabel) 44 + for fg in fragments: 45 + if last_index < fg.start: 46 + nbytes.extend(btext[last_index:fg.start]) 54 47 55 - nfg = replace(fg, start=fg.start + offset) 56 - change = len(nlabel) - len(label) 57 - offset += change 58 - nfg = replace(nfg, end=fg.end + offset) 48 + label_bytes = btext[fg.start:fg.end] 49 + label = label_bytes.decode('utf-8') 59 50 60 - nfragments.append(nfg) 61 - last_index = fg.end 51 + nlabel = label 52 + if isinstance(fg, LinkFragment): 53 + nlabel = self.normalize_link(nlabel, fg.url) 62 54 63 - ntext.append(text[last_index:]) 55 + nlabel_bytes = 
nlabel.encode('utf-8') 64 56 65 - return ''.join(ntext), nfragments 57 + nstart = len(nbytes) 58 + nbytes.extend(nlabel_bytes) 59 + nend = len(nbytes) 60 + 61 + nfg = replace(fg, start=nstart, end=nend) 62 + nfragments.append(nfg) 63 + 64 + last_index = fg.end 65 + 66 + if last_index < len(btext): 67 + nbytes.extend(btext[last_index:]) 68 + 69 + return nbytes.decode('utf-8'), nfragments 66 70 67 71 def split( 68 72 self, text: str, fragments: list[Fragment] 69 73 ) -> list[tuple[str, list[Fragment]]]: 70 74 text, fragments = self.url_normalize(text, fragments) 71 - if grapheme.length(text) <= self.climit: 75 + if self.tally_lenght((text, fragments)) <= self.climit: 72 76 return [(text, fragments)]