tangled
alpha
login
or
join now
zenfyr.dev
/
xpost
2
fork
atom
social media crossposting tool. 3rd time's the charm
mastodon
misskey
crossposting
bluesky
2
fork
atom
overview
issues
1
pulls
pipelines
switch from fragments to tokens
zenfyr.dev
3 months ago
2a4fcb3c
98395f82
verified
This commit was signed with the committer's
known signature
.
zenfyr.dev
SSH Key Fingerprint:
SHA256:TtcIcnTnoAB5mqHofsaOxIgiMzfVBxej1AXT7DQdrTE=
+408
-551
14 changed files
expand all
collapse all
unified
split
bluesky
facets.py
input.py
tokens.py
cross
fragments.py
post.py
tokens.py
mastodon
input.py
parser.py
misskey
input.py
tests
util
html_test.py
markdown_test.py
util
html.py
markdown.py
splitter.py
-87
bluesky/facets.py
···
1
1
-
from typing import Any, override
2
2
-
import cross.fragments as f
3
3
-
from util.splitter import FragmentSplitter, canonical_label
4
4
-
5
5
-
LINK = 'app.bsky.richtext.facet#link'
6
6
-
TAG = 'app.bsky.richtext.facet#tag'
7
7
-
MENTION = "app.bsky.richtext.facet#mention"
8
8
-
9
9
-
class BskySplitter(FragmentSplitter):
10
10
-
def __init__(self):
11
11
-
super().__init__(300, 30)
12
12
-
13
13
-
@override
14
14
-
def normalize_link(self, label: str, url: str) -> str:
15
15
-
if canonical_label(label, url):
16
16
-
nlabel = url.split("://", 1)[1]
17
17
-
if len(nlabel) <= self.urllen:
18
18
-
return nlabel
19
19
-
return nlabel[: self.urllen - 1] + "…"
20
20
-
return label
21
21
-
22
22
-
# TODO handle extending overlapping fragments somehow
23
23
-
def parse_facets(
24
24
-
text: str,
25
25
-
facets: list[dict[str, Any]] | None
26
26
-
) -> tuple[str, list[f.Fragment]]:
27
27
-
if not facets:
28
28
-
return text, []
29
29
-
30
30
-
btext = text.encode("utf-8")
31
31
-
nbytes = bytearray()
32
32
-
last_original_byte_index = 0
33
33
-
fragments: list[f.Fragment] = []
34
34
-
35
35
-
for facet in facets:
36
36
-
original_start: int = facet['index']['byteStart']
37
37
-
original_end: int = facet['index']['byteEnd']
38
38
-
39
39
-
if last_original_byte_index < original_start:
40
40
-
nbytes.extend(btext[last_original_byte_index:original_start])
41
41
-
42
42
-
fdict = {feat['$type']: feat for feat in facet.get('features', [])}
43
43
-
44
44
-
original_label_bytes = btext[original_start:original_end]
45
45
-
original_label_str = original_label_bytes.decode("utf-8")
46
46
-
47
47
-
nlabel_bytes = original_label_bytes
48
48
-
49
49
-
if LINK in fdict:
50
50
-
url: str = fdict.pop(LINK)['uri']
51
51
-
label = original_label_str
52
52
-
53
53
-
split = url.split("://", 1)
54
54
-
full_url = False
55
55
-
if len(split) > 1:
56
56
-
if split[1].startswith(label):
57
57
-
full_url = True
58
58
-
if label.endswith("...") and split[1].startswith(label[:-3]):
59
59
-
full_url = True
60
60
-
61
61
-
if full_url:
62
62
-
nlabel_bytes = url.encode("utf-8")
63
63
-
64
64
-
nstart = len(nbytes)
65
65
-
nbytes.extend(nlabel_bytes)
66
66
-
nend = len(nbytes)
67
67
-
68
68
-
fragments.append(f.LinkFragment(start=nstart, end=nend, url=url))
69
69
-
else:
70
70
-
nstart = len(nbytes)
71
71
-
nbytes.extend(nlabel_bytes)
72
72
-
nend = len(nbytes)
73
73
-
74
74
-
if TAG in fdict:
75
75
-
tag: str = fdict.pop(TAG)['tag']
76
76
-
fragments.append(f.TagFragment(start=nstart, end=nend, tag=tag))
77
77
-
78
78
-
if MENTION in fdict:
79
79
-
did: str = fdict.pop(MENTION)['did']
80
80
-
fragments.append(f.MentionFragment(start=nstart, end=nend, uri=did))
81
81
-
82
82
-
last_original_byte_index = original_end
83
83
-
84
84
-
if last_original_byte_index < len(btext):
85
85
-
nbytes.extend(btext[last_original_byte_index:])
86
86
-
87
87
-
return nbytes.decode("utf-8"), fragments
+3
-4
bluesky/input.py
···
8
8
import websockets
9
9
10
10
from atproto.util import AtUri
11
11
-
from bluesky.facets import parse_facets
11
11
+
from bluesky.tokens import tokenize_post
12
12
from bluesky.info import SERVICE, BlueskyService, validate_and_transform
13
13
from cross.attachments import (
14
14
LabelsAttachment,
···
76
76
)
77
77
return
78
78
79
79
-
text, fragments = parse_facets(record["text"], record.get('facets'))
80
80
-
post = Post(id=post_uri, parent_id=parent_uri, text=text)
81
81
-
post.fragments.extend(fragments)
79
79
+
tokens = tokenize_post(record["text"], record.get('facets', {}))
80
80
+
post = Post(id=post_uri, parent_id=parent_uri, tokens=tokens)
82
81
83
82
did, _, rid = AtUri.record_uri(post_uri)
84
83
post.attachments.put(
+95
bluesky/tokens.py
···
1
1
+
from cross.tokens import LinkToken, MentionToken, TagToken, TextToken, Token
2
2
+
3
3
+
4
4
+
def tokenize_post(text: str, facets: list[dict] | None) -> list[Token]:
    """Split a Bluesky post into text, link, tag and mention tokens.

    ``facets`` annotate byte ranges of ``text`` as encoded in UTF-8. Only
    the first feature of each facet is used, and when two facets overlap
    the one that starts first wins; the other is dropped and its bytes are
    emitted as plain text.

    Facets that cannot be applied safely are ignored instead of raising or
    corrupting the output: unknown feature types, missing or non-string
    values, missing or non-integer offsets, empty or inverted ranges, and
    ranges whose edges fall inside a multi-byte character. A ``byteEnd``
    past the end of the text is clamped to the text length.

    Args:
        text: The post text.
        facets: Facet dictionaries carrying ``index`` (``byteStart`` /
            ``byteEnd``) and ``features``. Any falsy value (``None``, an
            empty list, an empty dict) means "no facets".

    Returns:
        Tokens in reading order. An empty ``text`` yields an empty list;
        text without usable facets yields a single ``TextToken``.
    """
    if not text:
        return []
    if not facets:
        return [TextToken(text=text)]

    raw = text.encode("utf-8")
    size = len(raw)

    # feature "$type" -> (internal kind, key holding the feature's value)
    feature_kinds = {
        "app.bsky.richtext.facet#tag": ("tag", "tag"),
        "app.bsky.richtext.facet#link": ("link", "uri"),
        "app.bsky.richtext.facet#mention": ("mention", "did"),
    }

    def on_char_boundary(offset: int) -> bool:
        # UTF-8 continuation bytes look like 0b10xxxxxx; slicing at one of
        # them would make the later .decode() calls raise.
        return offset == size or (raw[offset] & 0xC0) != 0x80

    spans: list[tuple[int, int, str, str]] = []
    for facet in facets:
        features = facet.get("features") or []
        if not features:
            continue

        # we don't support overlapping facets/features
        feature = features[0]
        kind, value_key = feature_kinds.get(feature.get("$type"), (None, None))
        if kind is None:
            continue

        value = feature.get(value_key)
        index = facet.get("index") or {}
        start = index.get("byteStart")
        end = index.get("byteEnd")
        if not (
            isinstance(value, str)
            and isinstance(start, int)
            and isinstance(end, int)
        ):
            continue

        end = min(end, size)
        if not 0 <= start < end:
            # Empty or inverted ranges would otherwise rewind the cursor
            # and duplicate text, or emit tokens for text that is not there.
            continue
        if not (on_char_boundary(start) and on_char_boundary(end)):
            continue

        spans.append((start, end, kind, value))

    if not spans:
        return [TextToken(text=text)]

    # Stable sort: facets sharing a start keep their input order, so the
    # first one listed wins the overlap check below.
    spans.sort(key=lambda span: span[0])

    tokens: list[Token] = []
    cursor = 0
    for start, end, kind, value in spans:
        if start < cursor:
            # overlaps a facet we already emitted
            continue

        if start > cursor:
            # text between facets
            tokens.append(TextToken(text=raw[cursor:start].decode("utf-8")))

        label = raw[start:end].decode("utf-8")
        if kind == "link":
            # Try to unflatten links: when the visible label is just the
            # URL without its scheme (optionally cut short with "..."),
            # keep only the href so the full URL can be rendered.
            _, scheme_sep, rest = value.partition("://")
            if scheme_sep and rest.startswith(label.removesuffix("...")):
                tokens.append(LinkToken(href=value))
            else:
                tokens.append(LinkToken(href=value, label=label))
        elif kind == "tag":
            # The tag name comes from the visible text, not the facet value.
            tokens.append(TagToken(tag=label.removeprefix("#")))
        else:  # mention
            tokens.append(
                MentionToken(username=label.removeprefix("@"), uri=value)
            )

        cursor = end

    if cursor < size:
        tokens.append(TextToken(text=raw[cursor:].decode("utf-8")))

    return tokens
-25
cross/fragments.py
···
1
1
-
from dataclasses import dataclass
2
2
-
3
3
-
4
4
-
@dataclass(kw_only=True)
5
5
-
class Fragment:
6
6
-
start: int
7
7
-
end: int
8
8
-
9
9
-
10
10
-
@dataclass(kw_only=True)
11
11
-
class LinkFragment(Fragment):
12
12
-
url: str
13
13
-
14
14
-
15
15
-
@dataclass(kw_only=True)
16
16
-
class TagFragment(Fragment):
17
17
-
tag: str
18
18
-
19
19
-
20
20
-
@dataclass(kw_only=True)
21
21
-
class MentionFragment(Fragment):
22
22
-
uri: str
23
23
-
24
24
-
25
25
-
NON_OVERLAPPING: set[type[Fragment]] = {LinkFragment, TagFragment, MentionFragment}
+2
-3
cross/post.py
···
2
2
from typing import TypeVar
3
3
4
4
from cross.attachments import Attachment
5
5
-
from cross.fragments import Fragment
5
5
+
from cross.tokens import Token
6
6
7
7
T = TypeVar("T", bound=Attachment)
8
8
···
30
30
class Post:
31
31
id: str
32
32
parent_id: str | None
33
33
-
text: str # utf-8 text
33
33
+
tokens: list[Token]
34
34
attachments: AttachmentKeeper = field(default_factory=AttachmentKeeper)
35
35
-
fragments: list[Fragment] = field(default_factory=list)
+23
cross/tokens.py
···
1
1
+
from dataclasses import dataclass
2
2
+
3
3
+
@dataclass(kw_only=True)
4
4
+
class Token:
5
5
+
pass
6
6
+
7
7
+
@dataclass(kw_only=True)
8
8
+
class TextToken(Token):
9
9
+
text: str
10
10
+
11
11
+
@dataclass(kw_only=True)
12
12
+
class LinkToken(Token):
13
13
+
href: str
14
14
+
label: str | None = None
15
15
+
16
16
+
@dataclass(kw_only=True)
17
17
+
class TagToken(Token):
18
18
+
tag: str
19
19
+
20
20
+
@dataclass(kw_only=True)
21
21
+
class MentionToken(Token):
22
22
+
username: str
23
23
+
uri: str | None = None
+3
-4
mastodon/input.py
···
109
109
"Skipping %s, parent %s not found in db", status["id"], in_reply
110
110
)
111
111
return
112
112
-
parser = StatusParser()
112
112
+
parser = StatusParser(status)
113
113
parser.feed(status["content"])
114
114
-
text, fragments = parser.get_result()
114
114
+
tokens = parser.get_result()
115
115
116
116
-
post = Post(id=status["id"], parent_id=in_reply, text=text)
117
117
-
post.fragments.extend(fragments)
116
116
+
post = Post(id=status["id"], parent_id=in_reply, tokens=tokens)
118
117
119
118
if quote:
120
119
post.attachments.put(QuoteAttachment(quoted_id=quote['id'], quoted_user=self.user_id))
+14
-22
mastodon/parser.py
···
1
1
from typing import Any, override
2
2
-
import cross.fragments as f
3
3
-
from util.html import HTMLToFragmentsParser
2
2
+
3
3
+
from cross.tokens import LinkToken, MentionToken, TagToken
4
4
+
from util.html import HTMLToTokensParser
4
5
5
6
6
6
-
class StatusParser(HTMLToFragmentsParser):
7
7
+
class StatusParser(HTMLToTokensParser):
7
8
def __init__(self, status: dict[str, Any]) -> None:
8
9
super().__init__()
9
10
self.tags: set[str] = set(tag["url"] for tag in status.get("tags", []))
···
11
12
12
13
@override
13
14
def handle_a_endtag(self):
14
14
-
current_end = len(self.builder)
15
15
-
start, _attr = self._tag_stack.pop("a")
15
15
+
label, _attr = self._tag_stack.pop("a")
16
16
17
17
href = _attr.get("href")
18
18
-
if href and current_end > start:
18
18
+
if href:
19
19
cls = _attr.get("class", "")
20
20
if cls:
21
21
if "hashtag" in cls and href in self.tags:
22
22
-
tag = self.builder[start:current_end]
23
23
-
tag = tag[1:] if tag.startswith(b"#") else tag
22
22
+
tag = label[1:] if label.startswith("#") else label
24
23
25
25
-
self.fragments.append(
26
26
-
f.TagFragment(
27
27
-
start=start, end=current_end, tag=tag.decode("utf-8")
28
28
-
)
29
29
-
)
24
24
+
self.tokens.append(TagToken(tag=tag))
30
25
return
31
31
-
if "mention" in cls:
32
32
-
if href in self.mentions:
33
33
-
self.fragments.append(
34
34
-
f.MentionFragment(start=start, end=current_end, uri=href)
35
35
-
)
36
36
-
return
37
37
-
self.fragments.append(
38
38
-
f.LinkFragment(start=start, end=current_end, url=href)
39
39
-
)
26
26
+
if "mention" in cls and href in self.mentions:
27
27
+
username = label[1:] if label.startswith("@") else label
28
28
+
29
29
+
self.tokens.append(MentionToken(username=username, uri=href))
30
30
+
return
31
31
+
self.tokens.append(LinkToken(href=href, label=label))
+9
-3
misskey/input.py
···
104
104
)
105
105
return
106
106
107
107
+
mention_handles: dict = note.get("mentionHandles") or {}
108
108
+
tags: list[str] = note.get("tags") or []
109
109
+
110
110
+
handles: list[tuple[str, str]] = []
111
111
+
for key, value in mention_handles.items():
112
112
+
handles.append((value, value))
113
113
+
107
114
parser = MarkdownParser() # TODO MFM parser
108
108
-
text, fragments = parser.parse(note.get("text", ""))
109
109
-
post = Post(id=note["id"], parent_id=reply["id"] if reply else None, text=text)
110
110
-
post.fragments.extend(fragments)
115
115
+
tokens = parser.parse(note.get("text", ""), tags, handles)
116
116
+
post = Post(id=note["id"], parent_id=reply["id"] if reply else None, tokens=tokens)
111
117
112
118
post.attachments.put(RemoteUrlAttachment(url=self.url + "/notes/" + note["id"]))
113
119
if renote:
-32
tests/util/html_test.py
···
1
1
-
import html
2
2
-
from util.html import HTMLToFragmentsParser
3
3
-
import cross.fragments as f
4
4
-
import pytest
5
5
-
6
6
-
7
7
-
@pytest.fixture()
8
8
-
def parser():
9
9
-
return HTMLToFragmentsParser()
10
10
-
11
11
-
12
12
-
def test_html(parser: HTMLToFragmentsParser):
13
13
-
input = '<p><del>excuse</del> <em>me</em>, <strong>test</strong> post</p><blockquote><p>very testy <a href="https://google.com" target="_blank" rel="nofollow noopener">post</a></p></blockquote><pre><code>cat << food<br></code></pre>'
14
14
-
parser.feed(input)
15
15
-
text, frgs = parser.get_result()
16
16
-
17
17
-
excepted = "~~excuse~~ *me*, **test** post\n\n> very testy post\n\n\n```\ncat << food\n```"
18
18
-
assert text == excepted
19
19
-
assert len(frgs) == 1
20
20
-
21
21
-
assert isinstance(frgs[0], f.LinkFragment)
22
22
-
assert frgs[0].start == 46 and frgs[0].end == 50
23
23
-
assert frgs[0].url == "https://google.com"
24
24
-
25
25
-
26
26
-
def test_keep_autolink(parser: HTMLToFragmentsParser):
27
27
-
input = "<https://google.com>"
28
28
-
parser.feed(input)
29
29
-
text, frgs = parser.get_result()
30
30
-
31
31
-
# TODO
32
32
-
# assert text == input
-160
tests/util/markdown_test.py
···
1
1
-
from util.markdown import MarkdownParser
2
2
-
import cross.fragments as f
3
3
-
import pytest
4
4
-
5
5
-
EMOJI = "🤬🤬"
6
6
-
7
7
-
8
8
-
@pytest.fixture()
9
9
-
def parser():
10
10
-
return MarkdownParser()
11
11
-
12
12
-
13
13
-
def test_empty(parser: MarkdownParser):
14
14
-
text, frgs = parser.parse("")
15
15
-
assert text == ""
16
16
-
assert frgs == []
17
17
-
18
18
-
19
19
-
def test_no_formatting(parser: MarkdownParser):
20
20
-
text, frgs = parser.parse("text no formatting!")
21
21
-
assert text == "text no formatting!"
22
22
-
assert frgs == []
23
23
-
24
24
-
25
25
-
def test_link(parser: MarkdownParser):
26
26
-
text, frgs = parser.parse("https://google.com")
27
27
-
assert text == "https://google.com"
28
28
-
assert len(frgs) == 1
29
29
-
30
30
-
assert isinstance(frgs[0], f.LinkFragment)
31
31
-
assert frgs[0].start == 0 and frgs[0].end == 18
32
32
-
assert frgs[0].url == "https://google.com"
33
33
-
34
34
-
35
35
-
def test_link_emojis(parser: MarkdownParser):
36
36
-
input = f"{EMOJI} https://google.com"
37
37
-
text, frgs = parser.parse(input)
38
38
-
assert text == input
39
39
-
assert len(frgs) == 1
40
40
-
41
41
-
assert isinstance(frgs[0], f.LinkFragment)
42
42
-
assert frgs[0].start == 9 and frgs[0].end == 27
43
43
-
assert frgs[0].url == "https://google.com"
44
44
-
45
45
-
46
46
-
def test_label_link(parser: MarkdownParser):
47
47
-
text, frgs = parser.parse("[hello](https://google.com)")
48
48
-
assert text == "hello"
49
49
-
assert len(frgs) == 1
50
50
-
51
51
-
assert isinstance(frgs[0], f.LinkFragment)
52
52
-
assert frgs[0].start == 0 and frgs[0].end == 5
53
53
-
assert frgs[0].url == "https://google.com"
54
54
-
55
55
-
56
56
-
def test_label_link_emojis(parser: MarkdownParser):
57
57
-
input = f"[{EMOJI}]( https://google.com)"
58
58
-
text, frgs = parser.parse(input)
59
59
-
assert text == EMOJI
60
60
-
assert len(frgs) == 1
61
61
-
62
62
-
assert isinstance(frgs[0], f.LinkFragment)
63
63
-
assert frgs[0].start == 0 and frgs[0].end == 8
64
64
-
assert frgs[0].url == "https://google.com"
65
65
-
66
66
-
67
67
-
def test_tag(parser: MarkdownParser):
68
68
-
input = "#testing"
69
69
-
text, frgs = parser.parse(input)
70
70
-
assert text == input
71
71
-
assert len(frgs) == 1
72
72
-
73
73
-
assert isinstance(frgs[0], f.TagFragment)
74
74
-
assert frgs[0].start == 0 and frgs[0].end == 8
75
75
-
assert frgs[0].tag == "testing"
76
76
-
77
77
-
78
78
-
def test_tag_emojis(parser: MarkdownParser):
79
79
-
input = f"{EMOJI} #testing"
80
80
-
text, frgs = parser.parse(input)
81
81
-
assert text == input
82
82
-
assert len(frgs) == 1
83
83
-
84
84
-
assert isinstance(frgs[0], f.TagFragment)
85
85
-
assert frgs[0].start == 9 and frgs[0].end == 17
86
86
-
assert frgs[0].tag == "testing"
87
87
-
88
88
-
89
89
-
def test_mention(parser: MarkdownParser):
90
90
-
input = "@zen@merping.synth.download"
91
91
-
text, frgs = parser.parse(input)
92
92
-
assert text == input
93
93
-
assert len(frgs) == 1
94
94
-
95
95
-
assert isinstance(frgs[0], f.MentionFragment)
96
96
-
assert frgs[0].start == 0 and frgs[0].end == 27
97
97
-
assert frgs[0].uri == "zen@merping.synth.download"
98
98
-
99
99
-
100
100
-
def test_mention_emojis(parser: MarkdownParser):
101
101
-
input = f"{EMOJI} @zen@merping.synth.download"
102
102
-
text, frgs = parser.parse(input)
103
103
-
assert text == input
104
104
-
assert len(frgs) == 1
105
105
-
106
106
-
assert isinstance(frgs[0], f.MentionFragment)
107
107
-
assert frgs[0].start == 9 and frgs[0].end == 36
108
108
-
assert frgs[0].uri == "zen@merping.synth.download"
109
109
-
110
110
-
111
111
-
def test_mixed(parser: MarkdownParser):
112
112
-
input = "#testing_tag @zen@merping.synth.download [hello](https://zenfyr.dev/) hii! https://example.com"
113
113
-
text, frgs = parser.parse(input)
114
114
-
115
115
-
expected_text = (
116
116
-
"#testing_tag @zen@merping.synth.download hello hii! https://example.com"
117
117
-
)
118
118
-
assert text == expected_text
119
119
-
assert len(frgs) == 4
120
120
-
121
121
-
assert isinstance(frgs[0], f.TagFragment)
122
122
-
assert frgs[0].start == 0 and frgs[0].end == 12
123
123
-
assert frgs[0].tag == "testing_tag"
124
124
-
125
125
-
assert isinstance(frgs[1], f.MentionFragment)
126
126
-
assert frgs[1].start == 13 and frgs[1].end == 40
127
127
-
assert frgs[1].uri == "zen@merping.synth.download"
128
128
-
129
129
-
assert isinstance(frgs[2], f.LinkFragment)
130
130
-
assert frgs[2].start == 41 and frgs[2].end == 46
131
131
-
assert frgs[2].url == "https://zenfyr.dev/"
132
132
-
133
133
-
assert isinstance(frgs[3], f.LinkFragment)
134
134
-
assert frgs[3].start == 52 and frgs[3].end == 71
135
135
-
assert frgs[3].url == "https://example.com"
136
136
-
137
137
-
138
138
-
def test_mixed_html(parser: MarkdownParser):
139
139
-
input = f'<p>#testing_tag @zen@merping.synth.download</p> {EMOJI} <a href="https://zenfyr.dev/"><b>hello</b></a> hii! https://example.com'
140
140
-
text, frgs = parser.parse(input)
141
141
-
142
142
-
expected_text = f"#testing_tag @zen@merping.synth.download\n\n {EMOJI} **hello** hii! https://example.com"
143
143
-
assert text == expected_text
144
144
-
assert len(frgs) == 4
145
145
-
146
146
-
assert isinstance(frgs[0], f.TagFragment)
147
147
-
assert frgs[0].start == 0 and frgs[0].end == 12
148
148
-
assert frgs[0].tag == "testing_tag"
149
149
-
150
150
-
assert isinstance(frgs[1], f.MentionFragment)
151
151
-
assert frgs[1].start == 13 and frgs[1].end == 40
152
152
-
assert frgs[1].uri == "zen@merping.synth.download"
153
153
-
154
154
-
assert isinstance(frgs[2], f.LinkFragment)
155
155
-
assert frgs[2].start == 52 and frgs[2].end == 61
156
156
-
assert frgs[2].url == "https://zenfyr.dev/"
157
157
-
158
158
-
assert isinstance(frgs[3], f.LinkFragment)
159
159
-
assert frgs[3].start == 67 and frgs[3].end == 86
160
160
-
assert frgs[3].url == "https://example.com"
+78
-44
util/html.py
···
1
1
from html.parser import HTMLParser
2
2
from typing import override
3
3
-
import cross.fragments as f
4
3
5
5
-
class HTMLToFragmentsParser(HTMLParser):
4
4
+
from cross.tokens import LinkToken, TextToken, Token
5
5
+
from util.splitter import canonical_label
6
6
+
7
7
+
8
8
+
class HTMLToTokensParser(HTMLParser):
6
9
def __init__(self) -> None:
7
10
super().__init__()
8
8
-
self.builder: bytearray = bytearray()
9
9
-
self.fragments: list[f.Fragment] = []
11
11
+
self.tokens: list[Token] = []
10
12
11
11
-
self._tag_stack: dict[str, tuple[int, dict[str, str | None]]] = {}
13
13
+
self._tag_stack: dict[str, tuple[str, dict[str, str | None]]] = {}
12
14
self.in_pre: bool = False
13
15
self.in_code: bool = False
14
16
self.invisible: bool = False
15
17
16
18
def handle_a_endtag(self):
17
17
-
current_end = len(self.builder)
18
18
-
start, _attr = self._tag_stack.pop("a")
19
19
+
label, _attr = self._tag_stack.pop("a")
20
20
+
21
21
+
href = _attr.get("href")
22
22
+
if href:
23
23
+
if canonical_label(label, href):
24
24
+
self.tokens.append(LinkToken(href=href))
25
25
+
else:
26
26
+
self.tokens.append(LinkToken(href=href, label=label))
19
27
20
20
-
href = _attr.get('href')
21
21
-
if href and current_end > start:
22
22
-
self.fragments.append(
23
23
-
f.LinkFragment(start=start, end=current_end, url=href)
24
24
-
)
28
28
+
def append_text(self, text: str):
29
29
+
self.tokens.append(TextToken(text=text))
25
30
26
31
def append_newline(self):
27
27
-
if self.builder and not self.builder.endswith(b"\n"):
28
28
-
self.builder.extend(b"\n")
32
32
+
if self.tokens:
33
33
+
last_token = self.tokens[-1]
34
34
+
if isinstance(last_token, TextToken) and not last_token.text.endswith("\n"):
35
35
+
self.tokens.append(TextToken(text="\n"))
29
36
30
37
@override
31
38
def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
···
36
43
37
44
match tag:
38
45
case "p":
39
39
-
cls = _attr.get('class', '')
40
40
-
if cls and 'quote-inline' in cls:
46
46
+
cls = _attr.get("class", "")
47
47
+
if cls and "quote-inline" in cls:
41
48
self.invisible = True
42
49
case "a":
43
43
-
self._tag_stack["a"] = (len(self.builder), _attr)
50
50
+
self._tag_stack["a"] = ("", _attr)
44
51
case "code":
45
52
if not self.in_pre:
46
46
-
self.builder.extend(b"`")
53
53
+
self.append_text("`")
47
54
self.in_code = True
48
55
case "pre":
49
56
self.append_newline()
50
50
-
self.builder.extend(b"```\n")
57
57
+
self.append_text("```\n")
51
58
self.in_pre = True
52
59
case "blockquote":
53
60
self.append_newline()
54
54
-
self.builder.extend(b"> ")
61
61
+
self.append_text("> ")
55
62
case "strong" | "b":
56
56
-
self.builder.extend(b"**")
63
63
+
self.append_text("**")
57
64
case "em" | "i":
58
58
-
self.builder.extend(b"*")
65
65
+
self.append_text("*")
59
66
case "del" | "s":
60
60
-
self.builder.extend(b"~~")
67
67
+
self.append_text("~~")
61
68
case "br":
62
62
-
self.builder.extend(b"\n")
69
69
+
self.append_text("\n")
63
70
case "h1" | "h2" | "h3" | "h4" | "h5" | "h6":
64
71
level = int(tag[1])
65
65
-
self.builder.extend(("\n" + "#" * level + " ").encode('utf-8'))
72
72
+
self.append_text("\n" + "#" * level + " ")
66
73
case _:
67
67
-
#self.builder.extend(f"<{tag}>".encode("utf-8"))
74
74
+
# self.builder.extend(f"<{tag}>".encode("utf-8"))
68
75
pass
69
69
-
70
76
71
77
@override
72
78
def handle_endtag(self, tag: str) -> None:
···
81
87
self.handle_a_endtag()
82
88
case "code":
83
89
if not self.in_pre and self.in_code:
84
84
-
self.builder.extend(b"`")
90
90
+
self.append_text("`")
85
91
self.in_code = False
86
92
case "pre":
87
93
self.append_newline()
88
88
-
self.builder.extend(b"```\n")
94
94
+
self.append_text("```\n")
89
95
self.in_pre = False
90
96
case "blockquote":
91
91
-
self.builder.extend(b"\n")
97
97
+
self.append_text("\n")
92
98
case "strong" | "b":
93
93
-
self.builder.extend(b"**")
99
99
+
self.append_text("**")
94
100
case "em" | "i":
95
95
-
self.builder.extend(b"*")
101
101
+
self.append_text("*")
96
102
case "del" | "s":
97
97
-
self.builder.extend(b"~~")
103
103
+
self.append_text("~~")
98
104
case "p":
99
99
-
self.builder.extend(b"\n\n")
105
105
+
self.append_text("\n\n")
100
106
case "h1" | "h2" | "h3" | "h4" | "h5" | "h6":
101
101
-
self.builder.extend(b'\n')
107
107
+
self.append_text("\n")
102
108
case _:
103
103
-
#self.builder.extend(f"</{tag}>".encode("utf-8"))
109
109
+
# self.builder.extend(f"</{tag}>".encode("utf-8"))
104
110
pass
105
111
106
112
@override
107
113
def handle_data(self, data: str) -> None:
108
108
-
if not self.invisible:
109
109
-
self.builder.extend(data.encode('utf-8'))
114
114
+
if self.invisible:
115
115
+
return
116
116
+
117
117
+
if self._tag_stack.get('a'):
118
118
+
label, _attr = self._tag_stack.pop("a")
119
119
+
self._tag_stack["a"] = (label + data, _attr)
120
120
+
return
121
121
+
122
122
+
def get_result(self) -> list[Token]:
123
123
+
if not self.tokens:
124
124
+
return []
125
125
+
126
126
+
combined: list[Token] = []
127
127
+
buffer: list[str] = []
128
128
+
129
129
+
def flush_buffer():
130
130
+
if buffer:
131
131
+
merged = "".join(buffer)
132
132
+
combined.append(TextToken(text=merged))
133
133
+
buffer.clear()
110
134
111
111
-
def get_result(self) -> tuple[str, list[f.Fragment]]:
112
112
-
if self.builder.endswith(b'\n\n'):
113
113
-
return self.builder[:-2].decode('utf-8'), self.fragments
114
114
-
if self.builder.endswith(b'\n'):
115
115
-
return self.builder[:-1].decode('utf-8'), self.fragments
116
116
-
return self.builder.decode('utf-8'), self.fragments
135
135
+
for token in self.tokens:
136
136
+
if isinstance(token, TextToken):
137
137
+
buffer.append(token.text)
138
138
+
else:
139
139
+
flush_buffer()
140
140
+
combined.append(token)
141
141
+
142
142
+
flush_buffer()
143
143
+
144
144
+
if combined and isinstance(combined[-1], TextToken):
145
145
+
if combined[-1].text.endswith("\n\n"):
146
146
+
combined[-1] = TextToken(text=combined[-1].text[:-2])
147
147
+
148
148
+
if combined[-1].text.endswith("\n"):
149
149
+
combined[-1] = TextToken(text=combined[-1].text[:-1])
150
150
+
return combined
+92
-122
util/markdown.py
···
1
1
-
from dataclasses import replace
2
1
import re
3
3
-
import cross.fragments as f
4
4
-
from util.html import HTMLToFragmentsParser
5
2
6
6
-
URL = re.compile(rb"(?:(?:[A-Za-z][A-Za-z0-9+.-]*://)|mailto:)[^\s]+", re.IGNORECASE)
3
3
+
from cross.tokens import LinkToken, MentionToken, TagToken, TextToken, Token
4
4
+
from util.html import HTMLToTokensParser
5
5
+
from util.splitter import canonical_label
6
6
+
7
7
+
URL = re.compile(r"(?:(?:[A-Za-z][A-Za-z0-9+.-]*://)|mailto:)[^\s]+", re.IGNORECASE)
7
8
MD_INLINE_LINK = re.compile(
8
8
-
rb"\[([^\]]+)\]\(\s*((?:(?:[A-Za-z][A-Za-z0-9+.\-]*://)|mailto:)[^\s\)]+)\s*\)",
9
9
+
r"\[([^\]]+)\]\(\s*((?:(?:[A-Za-z][A-Za-z0-9+.\-]*://)|mailto:)[^\s\)]+)\s*\)",
9
10
re.IGNORECASE,
10
11
)
11
12
MD_AUTOLINK = re.compile(
12
12
-
rb"<((?:(?:[A-Za-z][A-Za-z0-9+.\-]*://)|mailto:)[^\s>]+)>", re.IGNORECASE
13
13
+
r"<((?:(?:[A-Za-z][A-Za-z0-9+.\-]*://)|mailto:)[^\s>]+)>", re.IGNORECASE
13
14
)
14
14
-
HASHTAG = re.compile(rb"(?<!\w)\#([\w]+)")
15
15
-
FEDIVERSE_HANDLE = re.compile(rb"(?<![\w@])@([\w\.-]+)(?:@([\w\.-]+\.[\w\.-]+))?")
15
15
+
HASHTAG = re.compile(r"(?<!\w)\#([\w]+)")
16
16
+
FEDIVERSE_HANDLE = re.compile(r"(?<![\w@])@([\w\.-]+)(?:@([\w\.-]+\.[\w\.-]+))?")
16
17
17
18
REGEXES = [URL, MD_INLINE_LINK, MD_AUTOLINK, HASHTAG, FEDIVERSE_HANDLE]
18
19
19
20
20
21
# TODO autolinks are broken by the html parser
21
22
class MarkdownParser:
22
22
-
def parse(self, text: str) -> tuple[str, list[f.Fragment]]:
23
23
+
def parse(
24
24
+
self, text: str, tags: list[str], handles: list[tuple[str, str]]
25
25
+
) -> list[Token]:
23
26
if not text:
24
24
-
return "", []
25
25
-
26
26
-
html_parser = HTMLToFragmentsParser()
27
27
-
html_parser.feed(text)
28
28
-
markdown, fragments = html_parser.get_result()
29
29
-
30
30
-
markdown_bytes: bytes = markdown.encode("utf-8")
27
27
+
return []
31
28
32
32
-
index: int = 0
33
33
-
total: int = len(markdown_bytes)
29
29
+
tokenizer = HTMLToTokensParser()
30
30
+
tokenizer.feed(text)
31
31
+
html_tokens = tokenizer.get_result()
34
32
35
35
-
events: list[tuple[int, int, re.Match[bytes] | f.Fragment, str]] = []
36
36
-
events.extend([(fg.start, fg.end, fg, "html") for fg in fragments])
33
33
+
tokens: list[Token] = []
37
34
38
38
-
while index < total:
39
39
-
ch: int = markdown_bytes[index]
40
40
-
rmatch: re.Match[bytes] | None = None
41
41
-
kind = None
42
42
-
43
43
-
if ch == b"["[0]:
44
44
-
rmatch = MD_INLINE_LINK.match(markdown_bytes, index)
45
45
-
kind = "inline_link"
46
46
-
# elif ch == b"<"[0]:
47
47
-
# rmatch = MD_AUTOLINK.match(markdown_bytes, index)
48
48
-
# kind = "autolink"
49
49
-
elif ch == b"#"[0]:
50
50
-
rmatch = HASHTAG.match(markdown_bytes, index)
51
51
-
kind = "hashtag"
52
52
-
elif ch == b"@"[0]:
53
53
-
rmatch = FEDIVERSE_HANDLE.match(markdown_bytes, index)
54
54
-
kind = "mention"
55
55
-
else:
56
56
-
rmatch = URL.match(markdown_bytes, index)
57
57
-
kind = "url"
58
58
-
59
59
-
if rmatch:
60
60
-
start, end = rmatch.start(), rmatch.end()
61
61
-
if end == index:
62
62
-
index += 1
35
35
+
for tk in html_tokens:
36
36
+
if isinstance(tk, TextToken):
37
37
+
tokens.extend(self.__tokenize_md(tk.text, tags, handles))
38
38
+
elif isinstance(tk, LinkToken):
39
39
+
if not tk.label or canonical_label(tk.label, tk.href):
40
40
+
tokens.append(tk)
63
41
continue
64
64
-
events.append((start, end, rmatch, kind))
65
65
-
index = end
66
66
-
continue
67
42
68
68
-
index += 1
69
69
-
70
70
-
events.sort(key=lambda x: x[0])
71
71
-
72
72
-
last_end: int = 0
73
73
-
for start, end, _, _ in events:
74
74
-
if start > end:
75
75
-
raise Exception(f"Invalid fragment position start={start}, end={end}")
76
76
-
if last_end > start:
77
77
-
raise Exception(
78
78
-
f"Overlapping text fragments at position end={last_end}, start={start}"
43
43
+
tokens.extend(
44
44
+
self.__tokenize_md(f"[{tk.label}]({tk.href})", tags, handles)
79
45
)
80
80
-
last_end = end
81
81
-
82
82
-
ntext: bytearray = bytearray()
83
83
-
nfragments: list[f.Fragment] = []
84
84
-
85
85
-
offset: int = 0
86
86
-
last_index: int = 0
87
87
-
88
88
-
for start, end, rmatch, event in events:
89
89
-
ntext.extend(markdown_bytes[last_index:start])
46
46
+
else:
47
47
+
tokens.append(tk)
90
48
91
91
-
if isinstance(rmatch, f.Fragment):
92
92
-
ntext.extend(markdown_bytes[start:end])
93
93
-
nfg = replace(rmatch, start=start + offset, end=end + offset)
94
94
-
nfragments.append(nfg)
95
95
-
last_index = end
96
96
-
continue
49
49
+
return tokens
97
50
98
98
-
nstart = start + offset
99
99
-
match event:
100
100
-
case "inline_link":
101
101
-
label_bytes: bytes = rmatch.group(1)
102
102
-
href_bytes: bytes = rmatch.group(2)
51
51
+
def __tokenize_md(
52
52
+
self, text: str, tags: list[str], handles: list[tuple[str, str]]
53
53
+
) -> list[Token]:
54
54
+
index: int = 0
55
55
+
total: int = len(text)
56
56
+
buffer: list[str] = []
103
57
104
104
-
ntext.extend(label_bytes)
58
58
+
tokens: list[Token] = []
105
59
106
106
-
delta = len(label_bytes) - (end - start)
107
107
-
offset += delta
60
60
+
def flush():
61
61
+
nonlocal buffer
62
62
+
if buffer:
63
63
+
tokens.append(TextToken(text="".join(buffer)))
64
64
+
buffer = []
108
65
109
109
-
nend = nstart + len(label_bytes)
110
110
-
nfragments.append(
111
111
-
f.LinkFragment(
112
112
-
start=nstart, end=nend, url=href_bytes.decode("utf-8")
113
113
-
)
114
114
-
)
66
66
+
while index < total:
67
67
+
if text[index] == "[":
68
68
+
md_inline = MD_INLINE_LINK.match(text, index)
69
69
+
if md_inline:
70
70
+
flush()
71
71
+
label = md_inline.group(1)
72
72
+
href = md_inline.group(2)
73
73
+
tokens.append(LinkToken(href=href, label=label))
74
74
+
index = md_inline.end()
75
75
+
continue
115
76
116
116
-
case "hashtag":
117
117
-
tag_bytes: bytes = rmatch.group(1)
118
118
-
ntext.extend(markdown_bytes[start:end])
119
119
-
nend = end + offset
120
120
-
nfragments.append(
121
121
-
f.TagFragment(
122
122
-
start=nstart, end=nend, tag=tag_bytes.decode("utf-8")
123
123
-
)
124
124
-
)
77
77
+
if text[index] == "<":
78
78
+
md_auto = MD_AUTOLINK.match(text, index)
79
79
+
if md_auto:
80
80
+
flush()
81
81
+
href = md_auto.group(1)
82
82
+
tokens.append(LinkToken(href=href, label=None))
83
83
+
index = md_auto.end()
84
84
+
continue
125
85
126
126
-
case "mention":
127
127
-
mention_bytes: bytes = rmatch.group(0)
128
128
-
ntext.extend(markdown_bytes[start:end])
86
86
+
if text[index] == "#":
87
87
+
tag = HASHTAG.match(text, index)
88
88
+
if tag:
89
89
+
tag_text = tag.group(1)
90
90
+
if tag_text.lower() in tags:
91
91
+
flush()
92
92
+
tokens.append(TagToken(tag=tag_text))
93
93
+
index = tag.end()
94
94
+
continue
129
95
130
130
-
mention_str = mention_bytes.decode("utf-8")
131
131
-
mention_str = (
132
132
-
mention_str[1:] if mention_str.startswith("@") else mention_str
133
133
-
)
96
96
+
if text[index] == "@":
97
97
+
handle = FEDIVERSE_HANDLE.match(text, index)
98
98
+
if handle:
99
99
+
handle_text = handle.group(0)
100
100
+
stripped_handle = handle_text.strip()
134
101
135
135
-
nend = end + offset
136
136
-
nfragments.append(
137
137
-
f.MentionFragment(start=nstart, end=nend, uri=mention_str)
102
102
+
match = next(
103
103
+
(pair for pair in handles if stripped_handle in pair), None
138
104
)
139
105
140
140
-
case "url":
141
141
-
url_bytes: bytes = rmatch.group(0)
142
142
-
ntext.extend(markdown_bytes[start:end])
143
143
-
nend = end + offset
144
144
-
nfragments.append(
145
145
-
f.LinkFragment(
146
146
-
start=nstart, end=nend, url=url_bytes.decode("utf-8")
147
147
-
)
148
148
-
)
106
106
+
if match:
107
107
+
flush()
108
108
+
tokens.append(
109
109
+
MentionToken(username=match[1], uri=None)
110
110
+
) # TODO: misskey doesn’t provide a uri
111
111
+
index = handle.end()
112
112
+
continue
149
113
150
150
-
case _:
151
151
-
pass
152
152
-
last_index = end
114
114
+
url = URL.match(text, index)
115
115
+
if url:
116
116
+
flush()
117
117
+
href = url.group(0)
118
118
+
tokens.append(LinkToken(href=href, label=None))
119
119
+
index = url.end()
120
120
+
continue
153
121
154
154
-
ntext.extend(markdown_bytes[last_index:])
122
122
+
buffer.append(text[index])
123
123
+
index += 1
155
124
156
156
-
return ntext.decode("utf-8"), nfragments
125
125
+
flush()
126
126
+
return tokens
+89
-45
util/splitter.py
···
1
1
+
import re
2
2
+
from dataclasses import replace
3
3
+
1
4
import grapheme
2
2
-
from cross.fragments import Fragment, LinkFragment
3
3
-
from dataclasses import replace
5
5
+
6
6
+
from cross.tokens import LinkToken, TagToken, TextToken, Token
4
7
5
8
6
9
def canonical_label(label: str | None, href: str):
···
15
18
return False
16
19
17
20
18
18
-
class FragmentSplitter:
19
19
-
def __init__(self, climit: int, urllen: int):
20
20
-
self.climit: int = climit
21
21
-
self.urllen: int = urllen
21
21
+
ALTERNATE = re.compile(r"\S+|\s+")
22
22
23
23
-
def normalize_link(self, label: str, url: str) -> str:
24
24
-
return label
25
23
26
26
-
def tally_lenght(self, post: tuple[str, list[Fragment]]):
27
27
-
return grapheme.length(post[0])
24
24
+
def split_tokens(
25
25
+
tokens: list[Token],
26
26
+
max_chars: int,
27
27
+
max_link_len: int = 35,
28
28
+
) -> list[list[Token]]:
29
29
+
def new_block() -> None:
30
30
+
nonlocal blocks, block, length
31
31
+
if block:
32
32
+
blocks.append(block)
33
33
+
block, length = [], 0
28
34
29
29
-
def url_normalize(
30
30
-
self, text: str, fragments: list[Fragment]
31
31
-
) -> tuple[str, list[Fragment]]:
32
32
-
if self.urllen == -1:
33
33
-
return text, fragments
34
34
-
btext = text.encode('utf-8')
35
35
+
def append_text(text: str) -> None:
36
36
+
nonlocal block
37
37
+
if block and isinstance(block[-1], TextToken):
38
38
+
block[-1] = replace(block[-1], text=block[-1].text + text)
39
39
+
else:
40
40
+
block.append(TextToken(text=text))
35
41
36
36
-
nbytes = bytearray()
37
37
-
nfragments: list[Fragment] = []
42
42
+
blocks: list[list[Token]] = []
43
43
+
block: list[Token] = []
44
44
+
length: int = 0
38
45
39
39
-
fragments = [fg for fg in fragments]
40
40
-
fragments.sort(key=lambda x: x.start)
46
46
+
for tk in tokens:
47
47
+
if isinstance(tk, TagToken):
48
48
+
tag_len = 1 + grapheme.length(tk.tag)
49
49
+
if length + tag_len > max_chars:
50
50
+
new_block()
51
51
+
block.append(tk)
52
52
+
length += tag_len
53
53
+
continue
54
54
+
if isinstance(tk, LinkToken):
55
55
+
label_text = tk.label or ""
56
56
+
link_len = grapheme.length(label_text)
41
57
42
42
-
last_index = 0
58
58
+
if canonical_label(tk.label, tk.href):
59
59
+
link_len = min(link_len, max_link_len)
43
60
44
44
-
for fg in fragments:
45
45
-
if last_index < fg.start:
46
46
-
nbytes.extend(btext[last_index:fg.start])
61
61
+
if length + link_len <= max_chars:
62
62
+
block.append(tk)
63
63
+
length += link_len
64
64
+
continue
47
65
48
48
-
label_bytes = btext[fg.start:fg.end]
49
49
-
label = label_bytes.decode('utf-8')
66
66
+
if length:
67
67
+
new_block()
50
68
51
51
-
nlabel = label
52
52
-
if isinstance(fg, LinkFragment):
53
53
-
nlabel = self.normalize_link(nlabel, fg.url)
69
69
+
remaining = label_text
70
70
+
while remaining:
71
71
+
room = (
72
72
+
max_chars
73
73
+
- length
74
74
+
- (0 if grapheme.length(remaining) <= max_chars else 1)
75
75
+
)
76
76
+
chunk = grapheme.slice(remaining, 0, room)
77
77
+
if grapheme.length(remaining) > room:
78
78
+
chunk += "-"
54
79
55
55
-
nlabel_bytes = nlabel.encode('utf-8')
80
80
+
block.append(replace(tk, label=chunk))
81
81
+
length += grapheme.length(chunk)
56
82
57
57
-
nstart = len(nbytes)
58
58
-
nbytes.extend(nlabel_bytes)
59
59
-
nend = len(nbytes)
83
83
+
remaining = grapheme.slice(remaining, room, grapheme.length(remaining))
84
84
+
if remaining:
85
85
+
new_block()
86
86
+
continue
87
87
+
if isinstance(tk, TextToken):
88
88
+
for seg in ALTERNATE.findall(tk.text):
89
89
+
seg_len = grapheme.length(seg)
60
90
61
61
-
nfg = replace(fg, start=nstart, end=nend)
62
62
-
nfragments.append(nfg)
91
91
+
if length + seg_len <= max_chars - (0 if seg.isspace() else 1):
92
92
+
append_text(seg)
93
93
+
length += seg_len
94
94
+
continue
63
95
64
64
-
last_index = fg.end
96
96
+
if length:
97
97
+
new_block()
65
98
66
66
-
if last_index < len(btext):
67
67
-
nbytes.extend(btext[last_index:])
99
99
+
if not seg.isspace():
100
100
+
while grapheme.length(seg) > max_chars - 1:
101
101
+
chunk = grapheme.slice(seg, 0, max_chars - 1) + "-"
102
102
+
append_text(chunk)
103
103
+
new_block()
104
104
+
seg = grapheme.slice(seg, max_chars - 1, grapheme.length(seg))
105
105
+
else:
106
106
+
while grapheme.length(seg) > max_chars:
107
107
+
chunk = grapheme.slice(seg, 0, max_chars)
108
108
+
append_text(chunk)
109
109
+
new_block()
110
110
+
seg = grapheme.slice(seg, max_chars, grapheme.length(seg))
68
111
69
69
-
return nbytes.decode('utf-8'), nfragments
112
112
+
if seg:
113
113
+
append_text(seg)
114
114
+
length = grapheme.length(seg)
115
115
+
continue
116
116
+
block.append(tk)
117
117
+
if block:
118
118
+
blocks.append(block)
70
119
71
71
-
def split(
72
72
-
self, text: str, fragments: list[Fragment]
73
73
-
) -> list[tuple[str, list[Fragment]]]:
74
74
-
text, fragments = self.url_normalize(text, fragments)
75
75
-
if self.tally_lenght((text, fragments)) <= self.climit:
76
76
-
return [(text, fragments)]
120
120
+
return blocks