A tool for parsing traffic on the jetstream and applying a moderation workstream driven by regexp-based rules

feat: Split utils into separate files

This commit splits the `utils.ts` file into three separate files:

- `utils/utils.ts`: Contains the `normalizeUnicode` function.
- `utils/getFinalUrl.ts`: Contains the `getFinalUrl` function.
- `utils/getLanguage.ts`: Contains the `getLanguage` function.

This improves code organization and readability.

feat: Split utils into separate files

Skywatch 56cf6a37 98bae0e9

+94 -86
+2 -1
src/rules/posts/checkPosts.ts
··· 8 8 createAccountComment, 9 9 createPostReport, 10 10 } from "../../moderation.js"; 11 - import { getFinalUrl, getLanguage } from "../../utils.js"; 11 + import { getLanguage } from "../../utils/getLanguage.js"; 12 + import { getFinalUrl } from "../../utils/getFinalUrl.js"; 12 13 import { LINK_SHORTENER, GLOBAL_ALLOW } from "../../constants.js"; 13 14 14 15 export const checkPosts = async (post: Post[]) => {
-85
src/utils.ts
··· 1 - import { logger } from "./logger.js"; 2 - 3 - import { homoglyphMap } from "./homoglyphs.js"; 4 - 5 - /** 6 - * Normalizes a string by converting it to lowercase, replacing homoglyphs, 7 - * and stripping diacritics. This is useful for sanitizing user input 8 - * before performing checks for forbidden words. 9 - * 10 - * The process is as follows: 11 - * 1. Convert the entire string to lowercase. 12 - * 2. Replace characters that are visually similar to ASCII letters (homoglyphs) 13 - * with their ASCII counterparts based on the `homoglyphMap`. 14 - * 3. Apply NFD (Normalization Form D) Unicode normalization to decompose 15 - * characters into their base characters and combining marks. 16 - * 4. Remove all Unicode combining diacritical marks. 17 - * 5. Apply NFKC (Normalization Form KC) Unicode normalization for a final 18 - * cleanup, which handles compatibility characters. 19 - * 20 - * @param text The input string to normalize. 21 - * @returns The normalized string. 22 - */ 23 - export function normalizeUnicode(text: string): string { 24 - // Convert to lowercase to match the homoglyph map keys 25 - const lowercased = text.toLowerCase(); 26 - 27 - // Replace characters using the homoglyph map. 28 - // This is done before NFD so that pre-composed characters are caught. 29 - let replaced = ""; 30 - for (const char of lowercased) { 31 - replaced += homoglyphMap[char] || char; 32 - } 33 - 34 - // First decompose the characters (NFD), then remove diacritics. 35 - const withoutDiacritics = replaced 36 - .normalize("NFD") 37 - .replace(/[\u0300-\u036f]/g, ""); 38 - 39 - // Final NFKC normalization to handle any remaining special characters. 
40 - return withoutDiacritics.normalize("NFKC"); 41 - } 42 - 43 - export async function getFinalUrl(url: string): Promise<string> { 44 - const controller = new AbortController(); 45 - const timeoutId = setTimeout(() => controller.abort(), 10000); // 10-second timeout 46 - 47 - try { 48 - const response = await fetch(url, { 49 - method: "HEAD", 50 - redirect: "follow", // This will follow redirects automatically 51 - signal: controller.signal, // Pass the abort signal to fetch 52 - }); 53 - clearTimeout(timeoutId); // Clear the timeout if fetch completes 54 - return response.url; // This will be the final URL after redirects 55 - } catch (error) { 56 - clearTimeout(timeoutId); // Clear the timeout if fetch fails 57 - // Log the error with more specific information if it's a timeout 58 - if (error instanceof Error && error.name === "AbortError") { 59 - logger.warn({ process: "UTILS", url, error }, "Timeout fetching URL"); 60 - } else { 61 - logger.warn({ process: "UTILS", url, error }, "Error fetching URL"); 62 - } 63 - throw error; // Re-throw the error to be caught by the caller 64 - } 65 - } 66 - 67 - export async function getLanguage(profile: string): Promise<string> { 68 - if (typeof profile !== "string") { 69 - logger.warn({ process: "UTILS", profile }, "getLanguage called with invalid profile data, defaulting to 'eng'"); 70 - return "eng"; // Default or throw an error 71 - } 72 - 73 - const profileText = profile.trim(); 74 - 75 - if (profileText.length === 0) { 76 - return "eng"; 77 - } 78 - 79 - const { franc } = await import("franc"); 80 - const detectedLang = franc(profileText); 81 - 82 - // franc returns "und" (undetermined) if it can't detect the language 83 - // Default to "eng" in such cases 84 - return detectedLang === "und" ? "eng" : detectedLang; 85 - }
+25
src/utils/getFinalUrl.ts
··· 1 + import { logger } from "../logger.js"; 2 + 3 + export async function getFinalUrl(url: string): Promise<string> { 4 + const controller = new AbortController(); 5 + const timeoutId = setTimeout(() => controller.abort(), 10000); // 10-second timeout 6 + 7 + try { 8 + const response = await fetch(url, { 9 + method: "HEAD", 10 + redirect: "follow", // This will follow redirects automatically 11 + signal: controller.signal, // Pass the abort signal to fetch 12 + }); 13 + clearTimeout(timeoutId); // Clear the timeout if fetch completes 14 + return response.url; // This will be the final URL after redirects 15 + } catch (error) { 16 + clearTimeout(timeoutId); // Clear the timeout if fetch fails 17 + // Log the error with more specific information if it's a timeout 18 + if (error instanceof Error && error.name === "AbortError") { 19 + logger.warn({ process: "UTILS", url, error }, "Timeout fetching URL"); 20 + } else { 21 + logger.warn({ process: "UTILS", url, error }, "Error fetching URL"); 22 + } 23 + throw error; // Re-throw the error to be caught by the caller 24 + } 25 + }
+24
src/utils/getLanguage.ts
··· 1 + import { logger } from "../logger.js"; 2 + 3 + export async function getLanguage(profile: string): Promise<string> { 4 + if (typeof profile !== "string") { 5 + logger.warn( 6 + { process: "UTILS", profile }, 7 + "getLanguage called with invalid profile data, defaulting to 'eng'", 8 + ); 9 + return "eng"; // Default or throw an error 10 + } 11 + 12 + const profileText = profile.trim(); 13 + 14 + if (profileText.length === 0) { 15 + return "eng"; 16 + } 17 + 18 + const { franc } = await import("franc"); 19 + const detectedLang = franc(profileText); 20 + 21 + // franc returns "und" (undetermined) if it can't detect the language 22 + // Default to "eng" in such cases 23 + return detectedLang === "und" ? "eng" : detectedLang; 24 + }
+43
src/utils/utils.ts
··· 1 + import { logger } from "../logger.js"; 2 + 3 + import { homoglyphMap } from "../homoglyphs.js"; 4 + 5 + /** 6 + * Normalizes a string by converting it to lowercase, replacing homoglyphs, 7 + * and stripping diacritics. This is useful for sanitizing user input 8 + * before performing checks for forbidden words. 9 + * 10 + * The process is as follows: 11 + * 1. Convert the entire string to lowercase. 12 + * 2. Replace characters that are visually similar to ASCII letters (homoglyphs) 13 + * with their ASCII counterparts based on the `homoglyphMap`. 14 + * 3. Apply NFD (Normalization Form D) Unicode normalization to decompose 15 + * characters into their base characters and combining marks. 16 + * 4. Remove all Unicode combining diacritical marks. 17 + * 5. Apply NFKC (Normalization Form KC) Unicode normalization for a final 18 + * cleanup, which handles compatibility characters. 19 + * 20 + * @param text The input string to normalize. 21 + * @returns The normalized string. 22 + */ 23 + export function normalizeUnicode(text: string): string { 24 + // Convert to lowercase to match the homoglyph map keys 25 + const lowercased = text.toLowerCase(); 26 + 27 + // Replace characters using the homoglyph map. 28 + // This is done before NFD so that pre-composed characters are caught. 29 + let replaced = ""; 30 + for (const char of lowercased) { 31 + replaced += homoglyphMap[char] || char; 32 + } 33 + 34 + // First decompose the characters (NFD), then remove diacritics. 35 + const withoutDiacritics = replaced 36 + .normalize("NFD") 37 + .replace(/[\u0300-\u036f]/g, ""); 38 + 39 + // Final NFKC normalization to handle any remaining special characters. 40 + return withoutDiacritics.normalize("NFKC"); 41 + } 42 + 43 +