tangled
alpha
login
or
join now
zenfyr.dev
/
xpost
2
fork
atom
social media crossposting tool. 3rd time's the charm
mastodon
misskey
crossposting
bluesky
2
fork
atom
overview
issues
1
pulls
pipelines
more work, convert parsers to use utf8 offsets
zenfyr.dev
5 months ago
749c26db
7e647c4b
verified
This commit was signed with the committer's
known signature
.
zenfyr.dev
SSH Key Fingerprint:
SHA256:TtcIcnTnoAB5mqHofsaOxIgiMzfVBxej1AXT7DQdrTE=
+224
-104
5 changed files
expand all
collapse all
unified
split
bluesky
facets.py
input.py
util
html.py
markdown.py
splitter.py
+87
bluesky/facets.py
reviewed
···
1
1
+
from typing import Any, override
2
2
+
import cross.fragments as f
3
3
+
from util.splitter import FragmentSplitter, canonical_label
4
4
+
5
5
+
LINK = 'app.bsky.richtext.facet#link'
6
6
+
TAG = 'app.bsky.richtext.facet#tag'
7
7
+
MENTION = "app.bsky.richtext.facet#mention"
8
8
+
9
9
+
class BskySplitter(FragmentSplitter):
    """Fragment splitter preconfigured for Bluesky posts.

    The base splitter is initialised with ``300`` and ``30``.
    NOTE(review): these are presumably the post length limit and the
    maximum link-label length (``self.urllen``) -- confirm against
    ``FragmentSplitter.__init__``.
    """

    def __init__(self):
        super().__init__(300, 30)

    @override
    def normalize_link(self, label: str, url: str) -> str:
        """Return the label to display for a link fragment.

        When ``canonical_label(label, url)`` is true, the label is replaced
        by the URL without its ``scheme://`` prefix, truncated to
        ``self.urllen`` characters with a trailing ellipsis if needed.
        Otherwise the label is returned unchanged.

        Args:
            label: The text currently shown for the link.
            url: The link target.

        Returns:
            The (possibly shortened) label.
        """
        if not canonical_label(label, url):
            return label

        # URLs without "://" (e.g. "mailto:someone@example.com") used to
        # raise IndexError here; fall back to the whole URL for those.
        _scheme, separator, remainder = url.partition("://")
        nlabel = remainder if separator else url

        if len(nlabel) <= self.urllen:
            return nlabel
        # Reserve one character for the ellipsis.
        return nlabel[: self.urllen - 1] + "…"
21
21
+
22
22
+
# TODO: overlapping facets are currently dropped; extend/merge them instead.
def parse_facets(
    text: str,
    facets: list[dict[str, Any]] | None
) -> tuple[str, list[f.Fragment]]:
    """Convert Bluesky rich-text facets into fragments.

    Facet indices are byte offsets into the UTF-8 encoding of ``text``.
    The text is rebuilt as bytes so that a link whose label is a
    (possibly ``...``-truncated) scheme-less prefix of its URL can be
    expanded to the full URL. Fragment ``start``/``end`` values are byte
    offsets into the UTF-8 encoding of the returned text.

    Facets are processed in ``byteStart`` order. A facet is ignored when
    its range is empty, lies outside the text, overlaps an earlier facet,
    or cuts through a multi-byte character.

    Args:
        text: The post text.
        facets: The record's facet list, or ``None``.

    Returns:
        A tuple of the rewritten text and the fragments found in it.
        With no facets, ``text`` is returned unchanged with an empty list.
    """
    if not facets:
        return text, []

    btext = text.encode("utf-8")
    nbytes = bytearray()
    fragments: list[f.Fragment] = []
    # Number of bytes of `btext` already copied (or replaced) into `nbytes`.
    consumed = 0

    # The rebuild below only works left to right, so do not rely on the
    # order in which the facets were supplied.
    ordered = sorted(facets, key=lambda facet: facet['index']['byteStart'])

    for facet in ordered:
        original_start: int = facet['index']['byteStart']
        original_end: int = facet['index']['byteEnd']

        # Reject empty, out-of-range and overlapping ranges; copying them
        # would duplicate text and corrupt every later offset.
        if not (consumed <= original_start < original_end <= len(btext)):
            continue

        nlabel_bytes = btext[original_start:original_end]
        try:
            label = nlabel_bytes.decode("utf-8")
        except UnicodeDecodeError:
            # The indices split a multi-byte character.
            continue

        # Plain text between the previous facet and this one.
        nbytes.extend(btext[consumed:original_start])

        fdict = {feat['$type']: feat for feat in facet.get('features', [])}
        link = fdict.get(LINK)
        url: str | None = None

        if link is not None:
            url = link['uri']
            _scheme, separator, remainder = url.partition("://")
            shortened = remainder.startswith(label) or (
                label.endswith("...") and remainder.startswith(label[:-3])
            )
            if separator and shortened:
                # The label is just an abbreviated URL: show the full URL.
                nlabel_bytes = url.encode("utf-8")

        nstart = len(nbytes)
        nbytes.extend(nlabel_bytes)
        nend = len(nbytes)

        if url is not None:
            # A link feature takes precedence over tag/mention features.
            fragments.append(f.LinkFragment(start=nstart, end=nend, url=url))
        else:
            if TAG in fdict:
                tag: str = fdict[TAG]['tag']
                fragments.append(f.TagFragment(start=nstart, end=nend, tag=tag))

            if MENTION in fdict:
                did: str = fdict[MENTION]['did']
                fragments.append(f.MentionFragment(start=nstart, end=nend, uri=did))

        consumed = original_end

    # Trailing text after the last facet.
    nbytes.extend(btext[consumed:])

    return nbytes.decode("utf-8"), fragments
+5
-2
bluesky/input.py
reviewed
···
8
8
import websockets
9
9
10
10
from atproto.util import AtUri
11
11
+
from bluesky.facets import parse_facets
11
12
from bluesky.info import SERVICE, BlueskyService, validate_and_transform
12
13
from cross.attachments import (
13
14
LabelsAttachment,
···
75
76
)
76
77
return
77
78
78
78
-
# TODO FRAGMENTS
79
79
-
post = Post(id=post_uri, parent_id=parent_uri, text=record["text"])
79
79
+
text, fragments = parse_facets(record["text"], record.get('facets'))
80
80
+
post = Post(id=post_uri, parent_id=parent_uri, text=text)
81
81
+
post.fragments.extend(fragments)
82
82
+
80
83
did, _, rid = AtUri.record_uri(post_uri)
81
84
post.attachments.put(
82
85
RemoteUrlAttachment(url=f"https://bsky.app/profile/{did}/post/{rid}")
+25
-27
util/html.py
reviewed
···
2
2
from typing import override
3
3
import cross.fragments as f
4
4
5
5
-
6
5
class HTMLToFragmentsParser(HTMLParser):
7
6
def __init__(self) -> None:
8
7
super().__init__()
9
9
-
self.text: str = ""
8
8
+
self.builder: bytearray = bytearray()
10
9
self.fragments: list[f.Fragment] = []
11
10
12
11
self._tag_stack: dict[str, tuple[int, dict[str, str | None]]] = {}
13
12
self.in_pre: bool = False
14
13
self.in_code: bool = False
15
15
-
16
14
self.invisible: bool = False
17
15
18
16
def handle_a_endtag(self):
19
19
-
current_end = len(self.text)
17
17
+
current_end = len(self.builder)
20
18
start, _attr = self._tag_stack.pop("a")
21
19
22
20
href = _attr.get('href')
···
30
28
_attr = dict(attrs)
31
29
32
30
def append_newline():
33
33
-
if self.text and not self.text.endswith("\n"):
34
34
-
self.text += "\n"
31
31
+
if self.builder and not self.builder.endswith(b"\n"):
32
32
+
self.builder.extend(b"\n")
35
33
36
34
if self.invisible:
37
35
return
···
42
40
if cls and 'quote-inline' in cls:
43
41
self.invisible = True
44
42
case "a":
45
45
-
self._tag_stack["a"] = (len(self.text), _attr)
43
43
+
self._tag_stack["a"] = (len(self.builder), _attr)
46
44
case "code":
47
45
if not self.in_pre:
48
48
-
self.text += "`"
46
46
+
self.builder.extend(b"`")
49
47
self.in_code = True
50
48
case "pre":
51
49
append_newline()
52
52
-
self.text += "```\n"
50
50
+
self.builder.extend(b"```\n")
53
51
self.in_pre = True
54
52
case "blockquote":
55
53
append_newline()
56
56
-
self.text += "> "
54
54
+
self.builder.extend(b"> ")
57
55
case "strong" | "b":
58
58
-
self.text += "**"
56
56
+
self.builder.extend(b"**")
59
57
case "em" | "i":
60
60
-
self.text += "*"
58
58
+
self.builder.extend(b"*")
61
59
case "del" | "s":
62
62
-
self.text += "~~"
60
60
+
self.builder.extend(b"~~")
63
61
case "br":
64
64
-
self.text += "\n"
62
62
+
self.builder.extend(b"\n")
65
63
case _:
66
64
if tag in {"h1", "h2", "h3", "h4", "h5", "h6"}:
67
65
level = int(tag[1])
68
68
-
self.text += "\n" + "#" * level + " "
66
66
+
self.builder.extend(("\n" + "#" * level + " ").encode('utf-8'))
69
67
70
68
@override
71
69
def handle_endtag(self, tag: str) -> None:
···
80
78
self.handle_a_endtag()
81
79
case "code":
82
80
if not self.in_pre and self.in_code:
83
83
-
self.text += "`"
81
81
+
self.builder.extend(b"`")
84
82
self.in_code = False
85
83
case "pre":
86
86
-
self.text += "\n```\n"
84
84
+
self.builder.extend(b"\n```\n")
87
85
self.in_pre = False
88
86
case "blockquote":
89
89
-
self.text += "\n"
87
87
+
self.builder.extend(b"\n")
90
88
case "strong" | "b":
91
91
-
self.text += "**"
89
89
+
self.builder.extend(b"**")
92
90
case "em" | "i":
93
93
-
self.text += "*"
91
91
+
self.builder.extend(b"*")
94
92
case "del" | "s":
95
95
-
self.text += "~~"
93
93
+
self.builder.extend(b"~~")
96
94
case "p":
97
97
-
self.text += "\n\n"
95
95
+
self.builder.extend(b"\n\n")
98
96
case _:
99
97
if tag in ["h1", "h2", "h3", "h4", "h5", "h6"]:
100
100
-
self.text += '\n'
98
98
+
self.builder.extend(b'\n')
101
99
102
100
@override
103
101
def handle_data(self, data: str) -> None:
104
102
if not self.invisible:
105
105
-
self.text += data
103
103
+
self.builder.extend(data.encode('utf-8'))
106
104
107
105
def get_result(self) -> tuple[str, list[f.Fragment]]:
108
108
-
if self.text.endswith('\n\n'):
109
109
-
return self.text[:-2], self.fragments
110
110
-
return self.text, self.fragments
106
106
+
if self.builder.endswith(b'\n\n'):
107
107
+
return self.builder[:-2].decode('utf-8'), self.fragments
108
108
+
return self.builder.decode('utf-8'), self.fragments
+71
-43
util/markdown.py
reviewed
···
3
3
import cross.fragments as f
4
4
from util.html import HTMLToFragmentsParser
5
5
6
6
-
URL = re.compile(r"(?:(?:[A-Za-z][A-Za-z0-9+.-]*://)|mailto:)[^\s]+", re.IGNORECASE)
6
6
+
URL = re.compile(rb"(?:(?:[A-Za-z][A-Za-z0-9+.-]*://)|mailto:)[^\s]+", re.IGNORECASE)
7
7
MD_INLINE_LINK = re.compile(
8
8
-
r"\[([^\]]+)\]\(\s*((?:(?:[A-Za-z][A-Za-z0-9+.\-]*://)|mailto:)[^\s\)]+)\s*\)",
8
8
+
rb"\[([^\]]+)\]\(\s*((?:(?:[A-Za-z][A-Za-z0-9+.\-]*://)|mailto:)[^\s\)]+)\s*\)",
9
9
re.IGNORECASE,
10
10
)
11
11
MD_AUTOLINK = re.compile(
12
12
-
r"<((?:(?:[A-Za-z][A-Za-z0-9+.\-]*://)|mailto:)[^\s>]+)>", re.IGNORECASE
12
12
+
rb"<((?:(?:[A-Za-z][A-Za-z0-9+.\-]*://)|mailto:)[^\s>]+)>", re.IGNORECASE
13
13
)
14
14
-
HASHTAG = re.compile(r"(?<!\w)\#([\w]+)")
15
15
-
FEDIVERSE_HANDLE = re.compile(r"(?<![\w@])@([\w\.-]+)(?:@([\w\.-]+\.[\w\.-]+))?")
14
14
+
HASHTAG = re.compile(rb"(?<!\w)\#([\w]+)")
15
15
+
FEDIVERSE_HANDLE = re.compile(rb"(?<![\w@])@([\w\.-]+)(?:@([\w\.-]+\.[\w\.-]+))?")
16
16
17
17
REGEXES = [URL, MD_INLINE_LINK, MD_AUTOLINK, HASHTAG, FEDIVERSE_HANDLE]
18
18
···
27
27
html_parser.feed(text)
28
28
markdown, fragments = html_parser.get_result()
29
29
30
30
+
markdown_bytes: bytes = markdown.encode("utf-8")
31
31
+
30
32
index: int = 0
31
31
-
total: int = len(markdown)
33
33
+
total: int = len(markdown_bytes)
32
34
33
33
-
# no match == processed fragments
34
34
-
events: list[tuple[int, int, re.Match[str] | f.Fragment, str]] = []
35
35
+
events: list[tuple[int, int, re.Match[bytes] | f.Fragment, str]] = []
35
36
events.extend([(fg.start, fg.end, fg, "html") for fg in fragments])
37
37
+
36
38
while index < total:
37
37
-
ch = markdown[index]
38
38
-
rmatch = None
39
39
+
ch: int = markdown_bytes[index]
40
40
+
rmatch: re.Match[bytes] | None = None
39
41
kind = None
40
42
41
41
-
if ch == "[":
42
42
-
rmatch = MD_INLINE_LINK.match(markdown, index)
43
43
+
if ch == b"["[0]:
44
44
+
rmatch = MD_INLINE_LINK.match(markdown_bytes, index)
43
45
kind = "inline_link"
44
44
-
# elif ch == '<':
45
45
-
# rmatch = MD_AUTOLINK.match(markdown, index)
46
46
-
# kind = "autolink"
47
47
-
elif ch == "#":
48
48
-
rmatch = HASHTAG.match(markdown, index)
46
46
+
# elif ch == b"<"[0]:
47
47
+
# rmatch = MD_AUTOLINK.match(markdown_bytes, index)
48
48
+
# kind = "autolink"
49
49
+
elif ch == b"#"[0]:
50
50
+
rmatch = HASHTAG.match(markdown_bytes, index)
49
51
kind = "hashtag"
50
50
-
elif ch == "@":
51
51
-
rmatch = FEDIVERSE_HANDLE.match(markdown, index)
52
52
+
elif ch == b"@"[0]:
53
53
+
rmatch = FEDIVERSE_HANDLE.match(markdown_bytes, index)
52
54
kind = "mention"
53
55
else:
54
54
-
rmatch = URL.match(markdown, index)
56
56
+
rmatch = URL.match(markdown_bytes, index)
55
57
kind = "url"
56
58
57
59
if rmatch:
···
67
69
68
70
events.sort(key=lambda x: x[0])
69
71
70
70
-
# validate fragment positions
71
72
last_end: int = 0
72
73
for start, end, _, _ in events:
73
74
if start > end:
···
78
79
)
79
80
last_end = end
80
81
81
81
-
ntext: list[str] = []
82
82
+
ntext: bytearray = bytearray()
82
83
nfragments: list[f.Fragment] = []
83
84
84
85
offset: int = 0
85
86
last_index: int = 0
86
87
87
87
-
events.sort(key=lambda x: x[0])
88
88
for start, end, rmatch, event in events:
89
89
-
ntext.append(markdown[last_index:start])
89
89
+
ntext.extend(markdown_bytes[last_index:start])
90
90
91
91
if isinstance(rmatch, f.Fragment):
92
92
-
ntext.append(markdown[start:end])
92
92
+
ntext.extend(markdown_bytes[start:end])
93
93
nfg = replace(rmatch, start=start + offset, end=end + offset)
94
94
nfragments.append(nfg)
95
95
last_index = end
96
96
continue
97
97
98
98
nstart = start + offset
99
99
-
nend = end + offset
100
99
match event:
101
100
case "inline_link":
102
102
-
label = rmatch.group(1)
103
103
-
href = rmatch.group(2)
104
104
-
ntext.append(label)
101
101
+
label_bytes: bytes = rmatch.group(1)
102
102
+
href_bytes: bytes = rmatch.group(2)
105
103
106
106
-
delta = len(label) - (end - start)
104
104
+
ntext.extend(label_bytes)
105
105
+
106
106
+
delta = len(label_bytes) - (end - start)
107
107
offset += delta
108
108
109
109
-
nfragments.append(f.LinkFragment(start=nstart, end=nstart + len(label), url=href))
109
109
+
nend = nstart + len(label_bytes)
110
110
+
nfragments.append(
111
111
+
f.LinkFragment(
112
112
+
start=nstart, end=nend, url=href_bytes.decode("utf-8")
113
113
+
)
114
114
+
)
115
115
+
110
116
case "hashtag":
111
111
-
tag = rmatch.group(1)
112
112
-
ntext.append(markdown[start:end])
113
113
-
nfragments.append(f.TagFragment(start=nstart, end=nend, tag=tag))
117
117
+
tag_bytes: bytes = rmatch.group(1)
118
118
+
ntext.extend(markdown_bytes[start:end])
119
119
+
nend = end + offset
120
120
+
nfragments.append(
121
121
+
f.TagFragment(
122
122
+
start=nstart, end=nend, tag=tag_bytes.decode("utf-8")
123
123
+
)
124
124
+
)
125
125
+
114
126
case "mention":
115
115
-
mention = rmatch.group(0)
116
116
-
ntext.append(markdown[start:end])
117
117
-
mention = mention[1:] if mention.startswith("@") else mention
118
118
-
nfragments.append(f.MentionFragment(start=nstart, end=nend, uri=mention))
127
127
+
mention_bytes: bytes = rmatch.group(0)
128
128
+
ntext.extend(markdown_bytes[start:end])
129
129
+
130
130
+
mention_str = mention_bytes.decode("utf-8")
131
131
+
mention_str = (
132
132
+
mention_str[1:] if mention_str.startswith("@") else mention_str
133
133
+
)
134
134
+
135
135
+
nend = end + offset
136
136
+
nfragments.append(
137
137
+
f.MentionFragment(start=nstart, end=nend, uri=mention_str)
138
138
+
)
139
139
+
119
140
case "url":
120
120
-
url = rmatch.group(0)
121
121
-
ntext.append(markdown[start:end])
122
122
-
nfragments.append(f.LinkFragment(start=nstart, end=nend, url=url))
141
141
+
url_bytes: bytes = rmatch.group(0)
142
142
+
ntext.extend(markdown_bytes[start:end])
143
143
+
nend = end + offset
144
144
+
nfragments.append(
145
145
+
f.LinkFragment(
146
146
+
start=nstart, end=nend, url=url_bytes.decode("utf-8")
147
147
+
)
148
148
+
)
149
149
+
123
150
case _:
124
151
pass
125
152
last_index = end
126
126
-
ntext.append(markdown[last_index:])
153
153
+
154
154
+
ntext.extend(markdown_bytes[last_index:])
127
155
128
128
-
return ''.join(ntext), nfragments
156
156
+
return ntext.decode("utf-8"), nfragments
+36
-32
util/splitter.py
reviewed
···
21
21
self.urllen: int = urllen
22
22
23
23
def normalize_link(self, label: str, url: str) -> str:
24
24
-
#if canonical_label(label, url):
25
25
-
# if self.urltrunc == "dotted":
26
26
-
# nlabel = url.split("://", 1)[1]
27
27
-
# if len(nlabel) <= self.urllen:
28
28
-
# return nlabel
29
29
-
# return nlabel[: self.urllen - 1] + "…"
30
24
return label
25
25
+
26
26
+
def tally_lenght(self, post: tuple[str, list[Fragment]]):
    """Return the length of a post's text as measured by ``grapheme.length``.

    Only the text (first element of ``post``) is counted; the fragment
    list is not consulted. The misspelled name is kept because callers
    use it.
    """
    text = post[0]
    return grapheme.length(text)
31
28
32
29
def url_normalize(
33
33
-
self, text: str, fragments: list[Fragment]
34
34
-
) -> tuple[str, list[Fragment]]:
35
35
-
if self.urllen == -1:
36
36
-
return text, fragments
30
30
+
self, text: str, fragments: list[Fragment]
31
31
+
) -> tuple[str, list[Fragment]]:
32
32
+
if self.urllen == -1:
33
33
+
return text, fragments
34
34
+
btext = text.encode('utf-8')
37
35
38
38
-
ntext: list[str] = []
39
39
-
nfragments: list[Fragment] = []
36
36
+
nbytes = bytearray()
37
37
+
nfragments: list[Fragment] = []
40
38
41
41
-
offset: int = 0
42
42
-
last_index: int = 0
39
39
+
fragments = [fg for fg in fragments]
40
40
+
fragments.sort(key=lambda x: x.start)
43
41
44
44
-
fragments = [fg for fg in fragments]
45
45
-
fragments.sort(key=lambda x: x.start)
42
42
+
last_index = 0
46
43
47
47
-
for fg in fragments:
48
48
-
ntext.append(text[last_index:fg.start])
49
49
-
label = text[fg.start:fg.end]
50
50
-
nlabel = label
51
51
-
if isinstance(fg, LinkFragment):
52
52
-
nlabel = self.normalize_link(nlabel, fg.url)
53
53
-
ntext.append(nlabel)
44
44
+
for fg in fragments:
45
45
+
if last_index < fg.start:
46
46
+
nbytes.extend(btext[last_index:fg.start])
54
47
55
55
-
nfg = replace(fg, start=fg.start + offset)
56
56
-
change = len(nlabel) - len(label)
57
57
-
offset += change
58
58
-
nfg = replace(nfg, end=fg.end + offset)
48
48
+
label_bytes = btext[fg.start:fg.end]
49
49
+
label = label_bytes.decode('utf-8')
59
50
60
60
-
nfragments.append(nfg)
61
61
-
last_index = fg.end
51
51
+
nlabel = label
52
52
+
if isinstance(fg, LinkFragment):
53
53
+
nlabel = self.normalize_link(nlabel, fg.url)
62
54
63
63
-
ntext.append(text[last_index:])
55
55
+
nlabel_bytes = nlabel.encode('utf-8')
64
56
65
65
-
return ''.join(ntext), nfragments
57
57
+
nstart = len(nbytes)
58
58
+
nbytes.extend(nlabel_bytes)
59
59
+
nend = len(nbytes)
60
60
+
61
61
+
nfg = replace(fg, start=nstart, end=nend)
62
62
+
nfragments.append(nfg)
63
63
+
64
64
+
last_index = fg.end
65
65
+
66
66
+
if last_index < len(btext):
67
67
+
nbytes.extend(btext[last_index:])
68
68
+
69
69
+
return nbytes.decode('utf-8'), nfragments
66
70
67
71
def split(
68
72
self, text: str, fragments: list[Fragment]
69
73
) -> list[tuple[str, list[Fragment]]]:
70
74
text, fragments = self.url_normalize(text, fragments)
71
71
-
if grapheme.length(text) <= self.climit:
75
75
+
if self.tally_lenght((text, fragments)) <= self.climit:
72
76
return [(text, fragments)]