tangled
alpha
login
or
join now
zenfyr.dev
/
xpost
2
fork
atom
social media crossposting tool. 3rd time's the charm
mastodon
misskey
crossposting
bluesky
2
fork
atom
overview
issues
1
pulls
pipelines
add misskey input (untested)
zenfyr.dev
5 months ago
793b1fd8
f223873a
verified
This commit was signed with the committer's
known signature
.
zenfyr.dev
SSH Key Fingerprint:
SHA256:TtcIcnTnoAB5mqHofsaOxIgiMzfVBxej1AXT7DQdrTE=
+394
-114
6 changed files
expand all
collapse all
unified
split
cross
fragments.py
mastodon
input.py
parser.py
misskey
input.py
util
html.py
markdown.py
+3
cross/fragments.py
reviewed
···
20
20
@dataclass(kw_only=True)
class MentionFragment(Fragment):
    """Fragment covering a span of post text that mentions an account."""

    # Identifier of the mentioned account, as supplied by the parser that
    # created the fragment.
    uri: str


# Fragment types grouped under the name "non-overlapping".
# NOTE(review): how this set is consumed is not visible in this diff.
NON_OVERLAPPING: set[type[Fragment]] = {LinkFragment, TagFragment, MentionFragment}
+1
-1
mastodon/input.py
reviewed
···
173
173
)
174
174
175
175
for out in self.outputs:
176
176
-
self.submitter(lambda: out.accept_repost(status["id"], reposted["id"]))
176
176
+
self.submitter(lambda: out.accept_repost(status["id"], reblog["id"]))
177
177
178
178
def _on_delete_post(self, status_id: str):
179
179
post = self._get_post(self.url, self.user_id, status_id)
+25
-112
mastodon/parser.py
reviewed
···
1
1
-
from html.parser import HTMLParser
2
1
from typing import override
3
2
import cross.fragments as f
3
3
+
from util.html import HTMLToFragmentsParser
4
4
5
5
6
6
-
class StatusParser(HTMLToFragmentsParser):
    """HTML-to-fragments parser that also recognises hashtag and mention anchors.

    Anchors whose ``class`` attribute contains ``hashtag`` or ``mention``
    become tag / mention fragments; every other anchor with an ``href`` and
    non-empty text becomes a plain link fragment.
    """

    def __init__(self) -> None:
        super().__init__()

    @override
    def handle_a_endtag(self):
        """Close the pending ``<a>`` and record the fragment it describes."""
        anchor_end = len(self.text)
        anchor_start, attributes = self._tag_stack.pop("a")

        target = attributes.get('href')
        if not target or anchor_end <= anchor_start:
            # No destination or no visible text: nothing to record.
            return

        css_classes = attributes.get('class', '') or ''
        label = self.text[anchor_start:anchor_end]

        if 'hashtag' in css_classes:
            # Store the tag name without its leading '#'.
            tag_name = label[1:] if label.startswith('#') else label
            self.fragments.append(
                f.TagFragment(start=anchor_start, end=anchor_end, tag=tag_name)
            )
            return

        if 'mention' in css_classes:  # TODO put the full acct in the fragment
            # Store the handle without its leading '@'.
            handle = label[1:] if label.startswith('@') else label
            self.fragments.append(
                f.MentionFragment(start=anchor_start, end=anchor_end, uri=handle)
            )
            return

        self.fragments.append(
            f.LinkFragment(start=anchor_start, end=anchor_end, url=target)
        )
+112
-1
misskey/input.py
reviewed
···
7
7
8
8
import websockets
9
9
10
10
+
from cross.attachments import (
11
11
+
LabelsAttachment,
12
12
+
MediaAttachment,
13
13
+
RemoteUrlAttachment,
14
14
+
SensitiveAttachment,
15
15
+
)
16
16
+
from cross.media import Blob, download_blob
17
17
+
from cross.post import Post
10
18
from cross.service import InputService
11
19
from database.connection import DatabasePool
12
20
from misskey.info import MisskeyService
21
21
+
from util.markdown import MarkdownParser
13
22
from util.util import normalize_service_url
14
23
15
24
ALLOWED_VISIBILITY = ["public", "home"]
···
53
62
return self.options.token
54
63
55
64
def _on_note(self, note: dict[str, Any]):
    """Convert an incoming note into a ``Post`` and hand it to every output.

    The note is ignored when it belongs to another user, has a visibility
    outside ``self.options.allowed_visibility``, contains a poll, is a quote
    (a renote that also carries text), replies to another user, replies to a
    post that is not in the database, or has media that fails to download.
    Pure renotes are delegated to ``_on_renote``.

    Args:
        note: Note payload as a decoded JSON object.
    """
    if note["userId"] != self.user_id:
        return

    if note["visibility"] not in self.options.allowed_visibility:
        return

    if note.get("poll"):
        self.log.info("Skipping '%s'! Contains a poll..", note["id"])
        return

    renote: dict[str, Any] | None = note.get("renote")
    if renote:
        if note.get("text") is not None:
            self.log.info("Skipping '%s'! Quote..", note["id"])
            return
        self._on_renote(note, renote)
        return

    reply: dict[str, Any] | None = note.get("reply")
    parent = None
    if reply:
        if reply.get("userId") != self.user_id:
            self.log.info("Skipping '%s'! Reply to other user..", note["id"])
            return

        parent = self._get_post(self.url, self.user_id, reply["id"])
        if not parent:
            self.log.info(
                "Skipping %s, parent %s not found in db", note["id"], reply["id"]
            )
            return

    parser = MarkdownParser()  # TODO MFM parser
    # `or` also covers keys that are present but null in the payload.
    text, fragments = parser.parse(note.get("text") or "")
    post = Post(id=note["id"], parent_id=reply["id"] if reply else None, text=text)
    post.fragments.extend(fragments)

    files: list[dict[str, Any]] = note.get("files") or []

    post.attachments.put(RemoteUrlAttachment(url=self.url + "/notes/" + note["id"]))
    if any(media.get("isSensitive", False) for media in files):
        post.attachments.put(SensitiveAttachment(sensitive=True))
    if note.get("cw"):
        post.attachments.put(LabelsAttachment(labels=[note["cw"]]))

    blobs: list[Blob] = []
    for media in files:
        self.log.info("Downloading %s...", media["url"])
        blob: Blob | None = download_blob(media["url"], media.get("comment", ""))
        if not blob:
            # One failed download drops the whole note.
            self.log.error(
                "Skipping %s! Failed to download media %s.",
                note["id"],
                media["url"],
            )
            return
        blobs.append(blob)

    if blobs:
        post.attachments.put(MediaAttachment(blobs=blobs))

    row: dict[str, Any] = {
        "user": self.user_id,
        "service": self.url,
        "identifier": note["id"],
    }
    if parent:
        row["parent"] = parent["id"]
        # A parent without a root is itself the root of the thread.
        row["root"] = parent["root"] or parent["id"]
    self._insert_post(row)

    for out in self.outputs:
        # Bind `out` per iteration: a bare closure would see only the last
        # output if the submitter runs the job after the loop has advanced.
        self.submitter(lambda out=out: out.accept_post(post))
146
146
+
147
147
+
def _on_renote(self, note: dict[str, Any], renote: dict[str, Any]):
    """Record a pure renote and forward it to every output as a repost.

    The renote is skipped (with an info log) when the renoted post is not
    present in the database.

    Args:
        note: The renoting note.
        renote: The note being renoted.
    """
    reposted = self._get_post(self.url, self.user_id, renote["id"])
    if not reposted:
        self.log.info(
            "Skipping repost '%s' as reposted post '%s' was not found in the db.",
            note["id"],
            renote["id"],
        )
        return

    self._insert_post(
        {
            "user": self.user_id,
            "service": self.url,
            "identifier": note["id"],
            "reposted": reposted["id"],
        }
    )

    for out in self.outputs:
        # Bind `out` per iteration: a bare closure would see only the last
        # output if the submitter runs the job after the loop has advanced.
        self.submitter(lambda out=out: out.accept_repost(note["id"], renote["id"]))
57
168
58
169
def _accept_msg(self, msg: websockets.Data) -> None:
59
170
data: dict[str, Any] = cast(dict[str, Any], json.loads(msg))
+110
util/html.py
reviewed
···
1
1
+
from html.parser import HTMLParser
2
2
+
from typing import override
3
3
+
import cross.fragments as f
4
4
+
5
5
+
6
6
+
class HTMLToFragmentsParser(HTMLParser):
    """Flatten HTML into markdown-flavoured text plus link fragments.

    Feed markup through ``feed()`` and read the outcome with ``get_result()``.
    Formatting tags are rewritten as markdown markers, anchors become
    ``LinkFragment`` entries positioned in the produced text, and a ``<p>``
    whose class contains ``quote-inline`` is dropped together with its content.
    """

    # Inline formatting tags and the marker written for both open and close.
    _INLINE_MARKERS = {
        "strong": "**",
        "b": "**",
        "em": "*",
        "i": "*",
        "del": "~~",
        "s": "~~",
    }
    _HEADING_TAGS = ("h1", "h2", "h3", "h4", "h5", "h6")

    def __init__(self) -> None:
        super().__init__()
        self.text: str = ""
        self.fragments: list[f.Fragment] = []

        # Open anchor, keyed by tag name: (offset where its text starts, attributes).
        self._tag_stack: dict[str, tuple[int, dict[str, str | None]]] = {}
        self.in_pre: bool = False
        self.in_code: bool = False

        # True while inside a suppressed ``quote-inline`` paragraph.
        self.invisible: bool = False

    def _ensure_newline(self) -> None:
        """Start a fresh line unless the text is empty or already ends one."""
        if self.text and not self.text.endswith("\n"):
            self.text += "\n"

    def handle_a_endtag(self):
        """Close the pending ``<a>`` and record it as a link fragment."""
        anchor_end = len(self.text)
        anchor_start, attributes = self._tag_stack.pop("a")

        target = attributes.get('href')
        if not target or anchor_end <= anchor_start:
            # No destination or no visible text: nothing to record.
            return
        self.fragments.append(
            f.LinkFragment(start=anchor_start, end=anchor_end, url=target)
        )

    @override
    def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
        if self.invisible:
            return

        attributes = dict(attrs)
        marker = self._INLINE_MARKERS.get(tag)

        if marker is not None:
            self.text += marker
        elif tag == "p":
            css_classes = attributes.get('class', '')
            if css_classes and 'quote-inline' in css_classes:
                self.invisible = True
        elif tag == "a":
            self._tag_stack["a"] = (len(self.text), attributes)
        elif tag == "code":
            # Inside <pre> the fence already marks the code; no backtick needed.
            if not self.in_pre:
                self.text += "`"
                self.in_code = True
        elif tag == "pre":
            self._ensure_newline()
            self.text += "```\n"
            self.in_pre = True
        elif tag == "blockquote":
            self._ensure_newline()
            self.text += "> "
        elif tag == "br":
            self.text += "\n"
        elif tag in self._HEADING_TAGS:
            # "h3" -> "\n### "
            self.text += "\n" + "#" * int(tag[1]) + " "

    @override
    def handle_endtag(self, tag: str) -> None:
        if self.invisible:
            # Only the closing </p> ends the suppressed region.
            if tag == "p":
                self.invisible = False
            return

        marker = self._INLINE_MARKERS.get(tag)

        if marker is not None:
            self.text += marker
        elif tag == "a":
            if "a" in self._tag_stack:
                self.handle_a_endtag()
        elif tag == "code":
            if not self.in_pre and self.in_code:
                self.text += "`"
                self.in_code = False
        elif tag == "pre":
            self.text += "\n```\n"
            self.in_pre = False
        elif tag == "blockquote":
            self.text += "\n"
        elif tag == "p":
            self.text += "\n\n"
        elif tag in self._HEADING_TAGS:
            self.text += '\n'

    @override
    def handle_data(self, data: str) -> None:
        if self.invisible:
            return
        self.text += data

    def get_result(self) -> tuple[str, list[f.Fragment]]:
        """Return the accumulated text, minus one trailing blank line, and the fragments."""
        trimmed = self.text[:-2] if self.text.endswith('\n\n') else self.text
        return trimmed, self.fragments
+143
util/markdown.py
reviewed
···
1
1
+
import re
2
2
+
import cross.fragments as f
3
3
+
from util.html import HTMLToFragmentsParser
4
4
+
5
5
+
URL = re.compile(r"(?:(?:[A-Za-z][A-Za-z0-9+.-]*://)|mailto:)[^\s]+", re.IGNORECASE)
6
6
+
MD_INLINE_LINK = re.compile(
7
7
+
r"\[([^\]]+)\]\(\s*((?:(?:[A-Za-z][A-Za-z0-9+.\-]*://)|mailto:)[^\s\)]+)\s*\)",
8
8
+
re.IGNORECASE,
9
9
+
)
10
10
+
MD_AUTOLINK = re.compile(
11
11
+
r"<((?:(?:[A-Za-z][A-Za-z0-9+.\-]*://)|mailto:)[^\s>]+)>", re.IGNORECASE
12
12
+
)
13
13
+
HASHTAG = re.compile(r"(?<!\w)\#([\w]+)")
14
14
+
FEDIVERSE_HANDLE = re.compile(r"(?<![\w@])@([\w\.-]+)(?:@([\w\.-]+\.[\w\.-]+))?")
15
15
+
16
16
+
REGEXES = [URL, MD_INLINE_LINK, MD_AUTOLINK, HASHTAG, FEDIVERSE_HANDLE]
17
17
+
18
18
+
19
19
+
# TODO autolinks are broken by the html parser
20
20
+
class MarkdownParser:
21
21
+
def parse(self, text: str) -> tuple[str, list[f.Fragment]]:
22
22
+
if not text:
23
23
+
return "", []
24
24
+
25
25
+
html_parser = HTMLToFragmentsParser()
26
26
+
html_parser.feed(text)
27
27
+
markdown, fragments = html_parser.get_result()
28
28
+
29
29
+
index: int = 0
30
30
+
total: int = len(markdown)
31
31
+
32
32
+
# no match == processed fragments
33
33
+
events: list[tuple[int, int, re.Match[str] | None, str]] = []
34
34
+
events.extend([(fg.start, fg.end, None, "html") for fg in fragments])
35
35
+
while index < total:
36
36
+
ch = markdown[index]
37
37
+
rmatch = None
38
38
+
kind = None
39
39
+
40
40
+
if ch == "[":
41
41
+
rmatch = MD_INLINE_LINK.match(markdown, index)
42
42
+
kind = "inline_link"
43
43
+
# elif ch == '<':
44
44
+
# rmatch = MD_AUTOLINK.match(markdown, index)
45
45
+
# kind = "autolink"
46
46
+
elif ch == "#":
47
47
+
rmatch = HASHTAG.match(markdown, index)
48
48
+
kind = "hashtag"
49
49
+
elif ch == "@":
50
50
+
rmatch = FEDIVERSE_HANDLE.match(markdown, index)
51
51
+
kind = "mention"
52
52
+
else:
53
53
+
rmatch = URL.match(markdown, index)
54
54
+
kind = "url"
55
55
+
56
56
+
if rmatch:
57
57
+
start, end = rmatch.start(), rmatch.end()
58
58
+
if end == index:
59
59
+
index += 1
60
60
+
continue
61
61
+
events.append((start, end, rmatch, kind))
62
62
+
index = end
63
63
+
continue
64
64
+
65
65
+
index += 1
66
66
+
67
67
+
events.sort(key=lambda x: x[0])
68
68
+
69
69
+
# validate fragment positions
70
70
+
last_end: int = 0
71
71
+
for start, end, _, _ in events:
72
72
+
if start > end:
73
73
+
raise Exception(f"Invalid fragment position start={start}, end={end}")
74
74
+
if last_end > start:
75
75
+
raise Exception(
76
76
+
f"Overlapping text fragments at position end={last_end}, start={start}"
77
77
+
)
78
78
+
last_end = end
79
79
+
80
80
+
def update_fragments(start: int, s, offset: int):
81
81
+
nonlocal fragments
82
82
+
83
83
+
for fg in fragments:
84
84
+
if fg != s and fg.start >= start:
85
85
+
fg.start += offset
86
86
+
fg.end += offset
87
87
+
88
88
+
new_text = ""
89
89
+
last_pos = 0
90
90
+
for start, end, rmatch, event in events:
91
91
+
if start > last_pos:
92
92
+
new_text += markdown[last_pos:start]
93
93
+
94
94
+
if not rmatch:
95
95
+
new_text += markdown[start:end]
96
96
+
last_pos = end
97
97
+
continue
98
98
+
99
99
+
match event:
100
100
+
case "inline_link":
101
101
+
label = rmatch.group(1)
102
102
+
href = rmatch.group(2)
103
103
+
fg = f.LinkFragment(start=start, end=start + len(label), url=href)
104
104
+
fragments.append(fg)
105
105
+
update_fragments(start, fg, -(end - (start + len(label))))
106
106
+
new_text += label
107
107
+
# case "autolink":
108
108
+
# url = rmatch.group(0)
109
109
+
# fg = f.LinkFragment(start=start, end=end - 2, url=url)
110
110
+
# fragments.append(fg)
111
111
+
# update_fragments(start, fg, -2)
112
112
+
# new_text += url
113
113
+
case "hashtag":
114
114
+
tag = rmatch.group(0)
115
115
+
fragments.append(
116
116
+
f.TagFragment(
117
117
+
start=start,
118
118
+
end=end,
119
119
+
tag=tag[1:] if tag.startswith("#") else tag,
120
120
+
)
121
121
+
)
122
122
+
new_text += markdown[start:end]
123
123
+
case "mention":
124
124
+
mention = rmatch.group(0)
125
125
+
fragments.append(
126
126
+
f.MentionFragment(
127
127
+
start=start,
128
128
+
end=end,
129
129
+
uri=mention[1:] if mention.startswith("@") else mention,
130
130
+
)
131
131
+
)
132
132
+
new_text += markdown[start:end]
133
133
+
case "url":
134
134
+
url = rmatch.group(0)
135
135
+
fragments.append(f.LinkFragment(start=start, end=end, url=url))
136
136
+
new_text += markdown[start:end]
137
137
+
case _:
138
138
+
pass
139
139
+
last_pos = end
140
140
+
if last_pos < len(markdown):
141
141
+
new_text += markdown[last_pos:]
142
142
+
143
143
+
return new_text, fragments