···6868 comment: string; // Comment for the label
6969 expires?: string; // Optional expiration date (ISO 8601) - check will be skipped after this date
7070}
7171-
-85
src/utils.ts
···11-import { logger } from "./logger.js";
22-33-import { homoglyphMap } from "./homoglyphs.js";
/**
 * Normalizes a string by converting it to lowercase, replacing homoglyphs,
 * and stripping diacritics. This is useful for sanitizing user input
 * before performing checks for forbidden words.
 *
 * The process is as follows:
 * 1. Convert the entire string to lowercase.
 * 2. Replace every code point that has an entry in `homoglyphMap` with the
 *    mapped replacement.
 * 3. Apply NFD (Normalization Form D) Unicode normalization to decompose
 *    characters into their base characters and combining marks.
 * 4. Remove the combining marks in the range U+0300-U+036F.
 * 5. Apply NFKC (Normalization Form KC) Unicode normalization, which folds
 *    compatibility characters.
 * 6. Lowercase the result again: NFKC can turn characters that have no
 *    lowercase mapping of their own (for example U+210C, or the mathematical
 *    bold capitals) into uppercase ASCII letters.
 *
 * @param text The input string to normalize.
 * @returns The normalized, lowercase string.
 */
export function normalizeUnicode(text: string): string {
  // Convert to lowercase to match the homoglyph map keys.
  const lowercased = text.toLowerCase();

  // Replace characters using the homoglyph map, one code point at a time.
  // This is done before NFD so that pre-composed characters are caught.
  // `??` rather than `||` so an entry that maps to "" is honoured instead of
  // silently falling back to the original character.
  let replaced = "";
  for (const char of lowercased) {
    replaced += homoglyphMap[char] ?? char;
  }

  // First decompose the characters (NFD), then remove diacritics.
  const withoutDiacritics = replaced
    .normalize("NFD")
    .replace(/[\u0300-\u036f]/g, "");

  // NFKC folds compatibility characters; the trailing toLowerCase() covers
  // any uppercase letters that this folding step introduces.
  return withoutDiacritics.normalize("NFKC").toLowerCase();
}
/**
 * Resolves the URL that `url` ends up at after redirects are followed.
 *
 * Sends a `HEAD` request with `redirect: "follow"` and returns the `url`
 * property of the response. The request is aborted when it has not completed
 * within `timeoutMs` milliseconds.
 *
 * @param url The URL to resolve.
 * @param timeoutMs Time to wait before aborting the request, in
 *   milliseconds. Defaults to 10000 (10 seconds).
 * @returns The URL reported by the response once redirects were followed.
 * @throws Re-throws whatever `fetch` rejects with, after logging a warning;
 *   this includes the abort error raised when the timeout elapses.
 */
export async function getFinalUrl(
  url: string,
  timeoutMs = 10000,
): Promise<string> {
  const controller = new AbortController();
  const timeoutId = setTimeout(() => controller.abort(), timeoutMs);

  try {
    const response = await fetch(url, {
      method: "HEAD",
      redirect: "follow", // Follow redirects automatically
      signal: controller.signal, // Lets the timer above cancel the request
    });
    return response.url; // The final URL after redirects
  } catch (error) {
    // Distinguish our own timeout from any other failure in the log message.
    const timedOut = error instanceof Error && error.name === "AbortError";
    logger.warn(
      { process: "UTILS", url, error },
      timedOut ? "Timeout fetching URL" : "Error fetching URL",
    );
    throw error; // Re-throw so the caller can handle it
  } finally {
    // Runs on success and on failure, so the timer never outlives the call.
    clearTimeout(timeoutId);
  }
}
/**
 * Detects the language of a profile text using `franc`.
 *
 * Falls back to "eng" when the argument is not a string (a warning is
 * logged in that case), when the trimmed text is empty, or when `franc`
 * reports "und" (undetermined).
 *
 * @param profile The text whose language should be detected.
 * @returns The code reported by `franc`, or "eng" as the fallback.
 */
export async function getLanguage(profile: string): Promise<string> {
  const fallbackLanguage = "eng";

  if (typeof profile === "string") {
    const text = profile.trim();
    if (text === "") {
      return fallbackLanguage;
    }

    // Loaded on demand, only once there is text to analyse.
    const { franc: detect } = await import("franc");
    const guess = detect(text);

    // "und" means franc could not determine the language.
    if (guess === "und") {
      return fallbackLanguage;
    }
    return guess;
  }

  logger.warn({ process: "UTILS", profile }, "getLanguage called with invalid profile data, defaulting to 'eng'");
  return fallbackLanguage;
}
···11+import { describe, it, expect, vi, beforeEach } from "vitest";
22+import { getLanguage } from "./getLanguage.js";
// Swap the real logger for a stub so the invalid-input warnings stay silent.
vi.mock("../logger.js", () => {
  return { logger: { warn: vi.fn() } };
});

describe("getLanguage", () => {
  beforeEach(() => {
    vi.clearAllMocks();
  });

  // Each case is [test title, input passed to getLanguage, expected code].
  type LanguageCase = readonly [title: string, input: unknown, expected: string];

  /** Registers one `it` per case; every case awaits getLanguage and compares. */
  const registerCases = (cases: readonly LanguageCase[]): void => {
    for (const [title, input, expected] of cases) {
      it(title, async () => {
        const result = await getLanguage(input as string);
        expect(result).toBe(expected);
      });
    }
  };

  describe("language detection", () => {
    registerCases([
      [
        "should detect English text",
        "Hello world, this is a test of the English language.",
        "eng",
      ],
      [
        "should detect Spanish text",
        "Hola mundo, esta es una prueba del idioma español con suficiente texto para detectar.",
        "spa",
      ],
      [
        "should detect French text",
        "Bonjour le monde, ceci est un test de la langue française avec suffisamment de texte.",
        "fra",
      ],
      [
        "should detect German text",
        "Hallo Welt, dies ist ein Test der deutschen Sprache mit genügend Text.",
        "deu",
      ],
      [
        "should detect Portuguese text",
        "Olá mundo, este é um teste da língua portuguesa com texto suficiente para detecção.",
        "por",
      ],
      [
        "should detect Italian text",
        "Ciao mondo, questo è un test della lingua italiana con abbastanza testo.",
        "ita",
      ],
      [
        "should detect Japanese text",
        "これは日本語のテストです。十分なテキストで言語を検出します。",
        "jpn",
      ],
    ]);
  });

  describe("edge cases", () => {
    registerCases([
      ["should default to eng for empty strings", "", "eng"],
      ["should default to eng for whitespace-only strings", " ", "eng"],
      ["should default to eng for very short text", "hi", "eng"],
      ["should default to eng for undetermined language", "123 456 789", "eng"],
      ["should default to eng for symbols only", "!@#$%^&*()", "eng"],
    ]);
  });

  describe("invalid input handling", () => {
    registerCases([
      ["should handle non-string input gracefully", 123, "eng"],
      ["should handle null input gracefully", null, "eng"],
      ["should handle undefined input gracefully", undefined, "eng"],
      ["should handle object input gracefully", {}, "eng"],
      ["should handle array input gracefully", [], "eng"],
    ]);
  });

  describe("trimming behavior", () => {
    registerCases([
      [
        "should trim leading whitespace",
        " Hello world, this is a test of the English language.",
        "eng",
      ],
      [
        "should trim trailing whitespace",
        "Hello world, this is a test of the English language. ",
        "eng",
      ],
      [
        "should trim both leading and trailing whitespace",
        " Hello world, this is a test of the English language. ",
        "eng",
      ],
    ]);
  });

  describe("mixed language text", () => {
    registerCases([
      [
        "should detect primary language in mixed content",
        "This is primarily English text with some español words mixed in.",
        "eng",
      ],
      [
        "should handle code mixed with text",
        "Here is some English text with const x = 123; code in it.",
        "eng",
      ],
    ]);
  });
});
+24
src/utils/getLanguage.ts
···11+import { logger } from "../logger.js";
/**
 * Detects the language of a profile text using `franc`.
 *
 * "eng" is returned when `profile` is not a string (after logging a
 * warning), when it is empty once trimmed, or when `franc` answers "und"
 * because it could not determine the language.
 *
 * @param profile The text whose language should be detected.
 * @returns The code reported by `franc`, or "eng" as the fallback.
 */
export async function getLanguage(profile: string): Promise<string> {
  // Callers outside the type system may still pass non-string data.
  if (typeof profile !== "string") {
    logger.warn(
      { process: "UTILS", profile },
      "getLanguage called with invalid profile data, defaulting to 'eng'",
    );
    return "eng";
  }

  const sample = profile.trim();
  if (!sample) {
    return "eng";
  }

  // Loaded on demand, only once there is text to analyse.
  const francModule = await import("franc");
  const { franc } = francModule;
  const code = franc(sample);

  // "und" (undetermined) falls back to "eng".
  return code !== "und" ? code : "eng";
}
···11+import { logger } from "../logger.js";
22+33+import { homoglyphMap } from "./homoglyphs.js";
/**
 * Normalizes a string by converting it to lowercase, replacing homoglyphs,
 * and stripping diacritics. This is useful for sanitizing user input
 * before performing checks for forbidden words.
 *
 * The process is as follows:
 * 1. Convert the entire string to lowercase.
 * 2. Replace every code point that has an entry in `homoglyphMap` with the
 *    mapped replacement.
 * 3. Apply NFD (Normalization Form D) Unicode normalization to decompose
 *    characters into their base characters and combining marks.
 * 4. Remove the combining marks in the range U+0300-U+036F.
 * 5. Apply NFKC (Normalization Form KC) Unicode normalization, which folds
 *    compatibility characters.
 * 6. Lowercase the result again: NFKC can turn characters that have no
 *    lowercase mapping of their own (for example U+210C, or the mathematical
 *    bold capitals) into uppercase ASCII letters.
 *
 * @param text The input string to normalize.
 * @returns The normalized, lowercase string.
 */
export function normalizeUnicode(text: string): string {
  // Convert to lowercase to match the homoglyph map keys.
  const lowercased = text.toLowerCase();

  // Replace characters using the homoglyph map, one code point at a time.
  // This is done before NFD so that pre-composed characters are caught.
  // `??` rather than `||` so an entry that maps to "" is honoured instead of
  // silently falling back to the original character.
  let replaced = "";
  for (const char of lowercased) {
    replaced += homoglyphMap[char] ?? char;
  }

  // First decompose the characters (NFD), then remove diacritics.
  const withoutDiacritics = replaced
    .normalize("NFD")
    .replace(/[\u0300-\u036f]/g, "");

  // NFKC folds compatibility characters; the trailing toLowerCase() covers
  // any uppercase letters that this folding step introduces.
  return withoutDiacritics.normalize("NFKC").toLowerCase();
}
4242+4343+