add misskey input (untested) · zenfyr.dev/xpost@793b1fd

+3

cross/fragments.py

@dataclass(kw_only=True)
class MentionFragment(Fragment):
    """Fragment covering a span of text that mentions an account."""

    # Identifier of the mentioned account. NOTE(review): producers currently
    # pass the visible handle text -- confirm whether a full URI is expected.
    uri: str


# Fragment types grouped under the "non overlapping" label.
# NOTE(review): presumably spans of these kinds must never overlap one
# another -- confirm against the code that consumes this set.
NON_OVERLAPPING: set[type[Fragment]] = {LinkFragment, TagFragment, MentionFragment}

+1 -1

mastodon/input.py

··· 173 173 ) 174 174 175 175 for out in self.outputs: 176 - self.submitter(lambda: out.accept_repost(status["id"], reposted["id"])) 176 + self.submitter(lambda: out.accept_repost(status["id"], reblog["id"])) 177 177 178 178 def _on_delete_post(self, status_id: str): 179 179 post = self._get_post(self.url, self.user_id, status_id)

+25 -112

mastodon/parser.py

from typing import override
import cross.fragments as f
from util.html import HTMLToFragmentsParser


class StatusParser(HTMLToFragmentsParser):
    """HTML parser that additionally classifies anchors by their CSS class.

    Anchors whose ``class`` attribute contains ``hashtag`` become
    ``TagFragment``s, those containing ``mention`` become
    ``MentionFragment``s, and every other anchor with an ``href`` becomes a
    plain ``LinkFragment``.
    """

    def __init__(self) -> None:
        super().__init__()

    @override
    def handle_a_endtag(self):
        """Record a fragment for the ``<a>`` element that has just closed.

        The anchor's start offset and attributes are popped from
        ``self._tag_stack``; nothing is recorded when the anchor has no
        ``href`` or produced no text.
        """
        anchor_end = len(self.text)
        anchor_start, attributes = self._tag_stack.pop("a")

        target = attributes.get('href')
        if not target or anchor_end <= anchor_start:
            return

        label = self.text[anchor_start:anchor_end]
        # A valueless ``class`` attribute is reported as None; treat as empty.
        css_classes = attributes.get('class', '') or ''

        if 'hashtag' in css_classes:
            self.fragments.append(
                f.TagFragment(
                    start=anchor_start,
                    end=anchor_end,
                    tag=label.removeprefix('#'),
                )
            )
        elif 'mention' in css_classes:  # TODO put the full acct in the fragment
            self.fragments.append(
                f.MentionFragment(
                    start=anchor_start,
                    end=anchor_end,
                    uri=label.removeprefix('@'),
                )
            )
        else:
            self.fragments.append(
                f.LinkFragment(start=anchor_start, end=anchor_end, url=target)
            )

+112 -1

misskey/input.py

··· 7 7 8 8 import websockets 9 9 10 + from cross.attachments import ( 11 + LabelsAttachment, 12 + MediaAttachment, 13 + RemoteUrlAttachment, 14 + SensitiveAttachment, 15 + ) 16 + from cross.media import Blob, download_blob 17 + from cross.post import Post 10 18 from cross.service import InputService 11 19 from database.connection import DatabasePool 12 20 from misskey.info import MisskeyService 21 + from util.markdown import MarkdownParser 13 22 from util.util import normalize_service_url 14 23 15 24 ALLOWED_VISIBILITY = ["public", "home"] ··· 53 62 return self.options.token 54 63 55 64 def _on_note(self, note: dict[str, Any]): 56 - self.log.info(note) # TODO 65 + if note["userId"] != self.user_id: 66 + return 67 + 68 + if note["visibility"] not in self.options.allowed_visibility: 69 + return 70 + 71 + if note.get("poll"): 72 + self.log.info("Skipping '%s'! Contains a poll..", note["id"]) 73 + return 74 + 75 + renote: dict[str, Any] | None = note.get("renote") 76 + if renote: 77 + if note.get("text") is not None: 78 + self.log.info("Skipping '%s'! Quote..", note["id"]) 79 + return 80 + self._on_renote(note, renote) 81 + return 82 + 83 + reply: dict[str, Any] | None = note.get("reply") 84 + if reply: 85 + if reply.get("userId") != self.user_id: 86 + self.log.info("Skipping '%s'! 
Reply to other user..", note["id"]) 87 + return 88 + 89 + parent = None 90 + if reply: 91 + parent = self._get_post(self.url, self.user_id, reply["id"]) 92 + if not parent: 93 + self.log.info( 94 + "Skipping %s, parent %s not found in db", note["id"], reply["id"] 95 + ) 96 + return 97 + 98 + parser = MarkdownParser() # TODO MFM parser 99 + text, fragments = parser.parse(note.get("text", "")) 100 + post = Post(id=note["id"], parent_id=reply["id"] if reply else None, text=text) 101 + post.fragments.extend(fragments) 102 + 103 + post.attachments.put(RemoteUrlAttachment(url=self.url + "/notes/" + note["id"])) 104 + if any([a.get("isSensitive", False) for a in note.get("files", [])]): 105 + post.attachments.put(SensitiveAttachment(sensitive=True)) 106 + if note.get("cw"): 107 + post.attachments.put(LabelsAttachment(labels=[note["cw"]])) 108 + 109 + blobs: list[Blob] = [] 110 + for media in note.get("files", []): 111 + self.log.info("Downloading %s...", media["url"]) 112 + blob: Blob | None = download_blob(media["url"], media.get("comment", "")) 113 + if not blob: 114 + self.log.error( 115 + "Skipping %s! 
Failed to download media %s.", 116 + note["id"], 117 + media["url"], 118 + ) 119 + return 120 + blobs.append(blob) 121 + 122 + if blobs: 123 + post.attachments.put(MediaAttachment(blobs=blobs)) 124 + 125 + if parent: 126 + self._insert_post( 127 + { 128 + "user": self.user_id, 129 + "service": self.url, 130 + "identifier": note["id"], 131 + "parent": parent["id"], 132 + "root": parent["id"] if not parent["root"] else parent["root"], 133 + } 134 + ) 135 + else: 136 + self._insert_post( 137 + { 138 + "user": self.user_id, 139 + "service": self.url, 140 + "identifier": note["id"], 141 + } 142 + ) 143 + 144 + for out in self.outputs: 145 + self.submitter(lambda: out.accept_post(post)) 146 + 147 + def _on_renote(self, note: dict[str, Any], renote: dict[str, Any]): 148 + reposted = self._get_post(self.url, self.user_id, renote["id"]) 149 + if not reposted: 150 + self.log.info( 151 + "Skipping repost '%s' as reposted post '%s' was not found in the db.", 152 + note["id"], 153 + renote["id"], 154 + ) 155 + return 156 + 157 + self._insert_post( 158 + { 159 + "user": self.user_id, 160 + "service": self.url, 161 + "identifier": note["id"], 162 + "reposted": reposted["id"], 163 + } 164 + ) 165 + 166 + for out in self.outputs: 167 + self.submitter(lambda: out.accept_repost(note["id"], renote["id"])) 57 168 58 169 def _accept_msg(self, msg: websockets.Data) -> None: 59 170 data: dict[str, Any] = cast(dict[str, Any], json.loads(msg))

+110

util/html.py

··· 1 + from html.parser import HTMLParser 2 + from typing import override 3 + import cross.fragments as f 4 + 5 + 6 + class HTMLToFragmentsParser(HTMLParser): 7 + def __init__(self) -> None: 8 + super().__init__() 9 + self.text: str = "" 10 + self.fragments: list[f.Fragment] = [] 11 + 12 + self._tag_stack: dict[str, tuple[int, dict[str, str | None]]] = {} 13 + self.in_pre: bool = False 14 + self.in_code: bool = False 15 + 16 + self.invisible: bool = False 17 + 18 + def handle_a_endtag(self): 19 + current_end = len(self.text) 20 + start, _attr = self._tag_stack.pop("a") 21 + 22 + href = _attr.get('href') 23 + if href and current_end > start: 24 + self.fragments.append( 25 + f.LinkFragment(start=start, end=current_end, url=href) 26 + ) 27 + 28 + @override 29 + def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None: 30 + _attr = dict(attrs) 31 + 32 + def append_newline(): 33 + if self.text and not self.text.endswith("\n"): 34 + self.text += "\n" 35 + 36 + if self.invisible: 37 + return 38 + 39 + match tag: 40 + case "p": 41 + cls = _attr.get('class', '') 42 + if cls and 'quote-inline' in cls: 43 + self.invisible = True 44 + case "a": 45 + self._tag_stack["a"] = (len(self.text), _attr) 46 + case "code": 47 + if not self.in_pre: 48 + self.text += "`" 49 + self.in_code = True 50 + case "pre": 51 + append_newline() 52 + self.text += "```\n" 53 + self.in_pre = True 54 + case "blockquote": 55 + append_newline() 56 + self.text += "> " 57 + case "strong" | "b": 58 + self.text += "**" 59 + case "em" | "i": 60 + self.text += "*" 61 + case "del" | "s": 62 + self.text += "~~" 63 + case "br": 64 + self.text += "\n" 65 + case _: 66 + if tag in {"h1", "h2", "h3", "h4", "h5", "h6"}: 67 + level = int(tag[1]) 68 + self.text += "\n" + "#" * level + " " 69 + 70 + @override 71 + def handle_endtag(self, tag: str) -> None: 72 + if self.invisible: 73 + if tag == "p": 74 + self.invisible = False 75 + return 76 + 77 + match tag: 78 + case "a": 79 + if "a" in 
self._tag_stack: 80 + self.handle_a_endtag() 81 + case "code": 82 + if not self.in_pre and self.in_code: 83 + self.text += "`" 84 + self.in_code = False 85 + case "pre": 86 + self.text += "\n```\n" 87 + self.in_pre = False 88 + case "blockquote": 89 + self.text += "\n" 90 + case "strong" | "b": 91 + self.text += "**" 92 + case "em" | "i": 93 + self.text += "*" 94 + case "del" | "s": 95 + self.text += "~~" 96 + case "p": 97 + self.text += "\n\n" 98 + case _: 99 + if tag in ["h1", "h2", "h3", "h4", "h5", "h6"]: 100 + self.text += '\n' 101 + 102 + @override 103 + def handle_data(self, data: str) -> None: 104 + if not self.invisible: 105 + self.text += data 106 + 107 + def get_result(self) -> tuple[str, list[f.Fragment]]: 108 + if self.text.endswith('\n\n'): 109 + return self.text[:-2], self.fragments 110 + return self.text, self.fragments

+143

util/markdown.py

··· 1 + import re 2 + import cross.fragments as f 3 + from util.html import HTMLToFragmentsParser 4 + 5 + URL = re.compile(r"(?:(?:[A-Za-z][A-Za-z0-9+.-]*://)|mailto:)[^\s]+", re.IGNORECASE) 6 + MD_INLINE_LINK = re.compile( 7 + r"\[([^\]]+)\]\(\s*((?:(?:[A-Za-z][A-Za-z0-9+.\-]*://)|mailto:)[^\s\)]+)\s*\)", 8 + re.IGNORECASE, 9 + ) 10 + MD_AUTOLINK = re.compile( 11 + r"<((?:(?:[A-Za-z][A-Za-z0-9+.\-]*://)|mailto:)[^\s>]+)>", re.IGNORECASE 12 + ) 13 + HASHTAG = re.compile(r"(?<!\w)\#([\w]+)") 14 + FEDIVERSE_HANDLE = re.compile(r"(?<![\w@])@([\w\.-]+)(?:@([\w\.-]+\.[\w\.-]+))?") 15 + 16 + REGEXES = [URL, MD_INLINE_LINK, MD_AUTOLINK, HASHTAG, FEDIVERSE_HANDLE] 17 + 18 + 19 + # TODO autolinks are broken by the html parser 20 + class MarkdownParser: 21 + def parse(self, text: str) -> tuple[str, list[f.Fragment]]: 22 + if not text: 23 + return "", [] 24 + 25 + html_parser = HTMLToFragmentsParser() 26 + html_parser.feed(text) 27 + markdown, fragments = html_parser.get_result() 28 + 29 + index: int = 0 30 + total: int = len(markdown) 31 + 32 + # no match == processed fragments 33 + events: list[tuple[int, int, re.Match[str] | None, str]] = [] 34 + events.extend([(fg.start, fg.end, None, "html") for fg in fragments]) 35 + while index < total: 36 + ch = markdown[index] 37 + rmatch = None 38 + kind = None 39 + 40 + if ch == "[": 41 + rmatch = MD_INLINE_LINK.match(markdown, index) 42 + kind = "inline_link" 43 + # elif ch == '<': 44 + # rmatch = MD_AUTOLINK.match(markdown, index) 45 + # kind = "autolink" 46 + elif ch == "#": 47 + rmatch = HASHTAG.match(markdown, index) 48 + kind = "hashtag" 49 + elif ch == "@": 50 + rmatch = FEDIVERSE_HANDLE.match(markdown, index) 51 + kind = "mention" 52 + else: 53 + rmatch = URL.match(markdown, index) 54 + kind = "url" 55 + 56 + if rmatch: 57 + start, end = rmatch.start(), rmatch.end() 58 + if end == index: 59 + index += 1 60 + continue 61 + events.append((start, end, rmatch, kind)) 62 + index = end 63 + continue 64 + 65 + index += 1 66 + 67 + 
events.sort(key=lambda x: x[0]) 68 + 69 + # validate fragment positions 70 + last_end: int = 0 71 + for start, end, _, _ in events: 72 + if start > end: 73 + raise Exception(f"Invalid fragment position start={start}, end={end}") 74 + if last_end > start: 75 + raise Exception( 76 + f"Overlapping text fragments at position end={last_end}, start={start}" 77 + ) 78 + last_end = end 79 + 80 + def update_fragments(start: int, s, offset: int): 81 + nonlocal fragments 82 + 83 + for fg in fragments: 84 + if fg != s and fg.start >= start: 85 + fg.start += offset 86 + fg.end += offset 87 + 88 + new_text = "" 89 + last_pos = 0 90 + for start, end, rmatch, event in events: 91 + if start > last_pos: 92 + new_text += markdown[last_pos:start] 93 + 94 + if not rmatch: 95 + new_text += markdown[start:end] 96 + last_pos = end 97 + continue 98 + 99 + match event: 100 + case "inline_link": 101 + label = rmatch.group(1) 102 + href = rmatch.group(2) 103 + fg = f.LinkFragment(start=start, end=start + len(label), url=href) 104 + fragments.append(fg) 105 + update_fragments(start, fg, -(end - (start + len(label)))) 106 + new_text += label 107 + # case "autolink": 108 + # url = rmatch.group(0) 109 + # fg = f.LinkFragment(start=start, end=end - 2, url=url) 110 + # fragments.append(fg) 111 + # update_fragments(start, fg, -2) 112 + # new_text += url 113 + case "hashtag": 114 + tag = rmatch.group(0) 115 + fragments.append( 116 + f.TagFragment( 117 + start=start, 118 + end=end, 119 + tag=tag[1:] if tag.startswith("#") else tag, 120 + ) 121 + ) 122 + new_text += markdown[start:end] 123 + case "mention": 124 + mention = rmatch.group(0) 125 + fragments.append( 126 + f.MentionFragment( 127 + start=start, 128 + end=end, 129 + uri=mention[1:] if mention.startswith("@") else mention, 130 + ) 131 + ) 132 + new_text += markdown[start:end] 133 + case "url": 134 + url = rmatch.group(0) 135 + fragments.append(f.LinkFragment(start=start, end=end, url=url)) 136 + new_text += markdown[start:end] 137 + case _: 
138 + pass 139 + last_pos = end 140 + if last_pos < len(markdown): 141 + new_text += markdown[last_pos:] 142 + 143 + return new_text, fragments