Skip to content

Commit

Permalink
perf(parsing): use one-pass substitution (re.sub) to speed up emojify()
Browse files Browse the repository at this point in the history
Signed-off-by: Rongrong <[email protected]>
  • Loading branch information
Rongronggg9 committed Jun 10, 2024
1 parent 84ce41f commit 6ae43af
Showing 1 changed file with 8 additions and 5 deletions.
13 changes: 8 additions & 5 deletions src/parsing/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,11 @@
sorted(set(SPACES + INVALID_CHARACTERS + string.punctuation + string.whitespace))
)

# A single alternation over every key of EMOJIFY_MAP, so that all phrases can be substituted in one pass.
# The first and last character of each key are dropped (phrase[1:-1]), the rest is regex-escaped, and a literal
# '[' ... ']' is put back around the whole group.
# NOTE(review): this assumes every EMOJIFY_MAP key has the form '[name]' -- confirm against the map's definition.
# false positive:
# noinspection RegExpUnnecessaryNonCapturingGroup
EMOJIFY_RE: Final[re.Pattern] = re.compile(rf'\[(?:{"|".join(re.escape(phrase[1:-1]) for phrase in EMOJIFY_MAP)})]')
# Callable taking the text: each match is replaced by the EMOJIFY_MAP value stored under the whole matched
# text (match.group(0), i.e. brackets included).
emojifyReSub = partial(EMOJIFY_RE.sub, lambda match: EMOJIFY_MAP[match.group(0)])

# Pre-bound `Pattern.sub` helpers: each one takes the text and returns it with every match replaced.
# Replaces anything matched by the character class built from INVALID_CHARACTERS with a single space.
replaceInvalidCharacter = partial(re.compile(rf'[{INVALID_CHARACTERS}]').sub, ' ')  # use initially
# Same, but the character class is built from SPACES without its first character (SPACES[1:]).
replaceSpecialSpace = partial(re.compile(rf'[{SPACES[1:]}]').sub, ' ')  # use carefully
# Rewrites a <br> tag (optionally self-closing, with optional inner whitespace) together with any
# whitespace around it into a bare '<br>'.
stripBr = partial(re.compile(r'\s*<br\s*/?\s*>\s*').sub, '<br>')
Expand Down Expand Up @@ -115,11 +120,9 @@ def resolve_relative_link(base: Optional[str], url: Optional[str]) -> str:


def emojify(xml):
# NOTE(review): diff view -- the next five lines are the body removed by this commit. After emojize(), it ran
# one str.replace() over the whole text for every EMOJIFY_MAP entry.
xml = emojize(xml, language='alias', variant='emoji_type')
for emoticon_phrase, emoji in EMOJIFY_MAP.items():
# emojify weibo emoticons, get all here: https://api.weibo.com/2/emotions.json?source=1362404091
xml = xml.replace(emoticon_phrase, emoji)
return xml
# NOTE(review): the remaining lines are the body added by this commit. The emojize() result is passed through
# emojifyReSub once instead of being rescanned per map entry.
return emojifyReSub(
emojize(xml, language='alias', variant='emoji_type')
)


def is_emoticon(tag: Tag) -> bool:
Expand Down

0 comments on commit 6ae43af

Please sign in to comment.