···11-# Implementation Plan: Replace lande with franc
22-33-## Overview
44-Replace the `lande` library with `franc` for language detection in the `getLanguage` function located in `src/utils.ts`.
55-66-## Current State Analysis
77-- **Current Library**: `lande` v1.0.10
88-- **Function Location**: `src/utils.ts:67-92`
99-- **Current Implementation**:
1010- - Uses dynamic import: `const lande = (await import("lande")).default;`
1111- - Returns a probability map sorted by likelihood
1212- - Returns the language code with highest probability
1313- - Defaults to "eng" for empty or invalid input
1414-1515-## Implementation Steps
1616-1717-### 1. Research & Dependencies
1818-- **franc** is a natural language detection library similar to `lande`
1919-- Supports 187 languages (ISO 639-3 codes)
2020-- Smaller footprint and better maintained than `lande`
2121-- Returns ISO 639-3 codes (3-letter codes like "eng", "fra", "spa")
2222-2323-### 2. Code Changes Required
2424-2525-#### Step 2.1: Update package.json
2626-- Remove: `"lande": "^1.0.10"`
2727-- Add: `"franc": "^6.2.0"` (latest stable version)
2828-2929-#### Step 2.2: Modify getLanguage function
3030-```typescript
3131-// Before (lines 82-92)
3232-const lande = (await import("lande")).default;
3333-let langsProbabilityMap = lande(profileText);
3434-langsProbabilityMap.sort(...);
3535-return langsProbabilityMap[0][0];
3636-3737-// After
3838-const { franc } = await import("franc");
3939-const detectedLang = franc(profileText);
4040-return detectedLang === "und" ? "eng" : detectedLang;
4141-```
4242-4343-### 3. Key Differences & Considerations
4444-4545-#### API Differences:
4646-- **lande**: Returns array of `[language, probability]` tuples
4747-- **franc**: Returns single language code or "und" (undetermined)
4848-4949-#### Return Values:
5050-- Both libraries use ISO 639-3 codes (3-letter codes)
5151-- franc returns "und" for undetermined text (we'll map to "eng" default)
5252-5353-### 4. Testing Strategy
5454-1. Test with empty string → should return "eng"
5555-2. Test with invalid input (null/undefined) → should return "eng"
5656-3. Test with English text → should return "eng"
5757-4. Test with other language samples → verify correct detection
5858-5. Test with mixed language text → verify reasonable detection
5959-6060-### 5. Rollback Plan
6161-If issues arise:
6262-1. Keep the original `lande` code commented
6363-2. Can quickly revert by uncommenting old code and reinstalling `lande`
6464-6565-## Implementation Order
6666-1. ✅ Analyze current implementation
6767-2. ✅ Research franc library compatibility
6868-3. 📝 Create this implementation plan
6969-4. Update package.json to replace lande with franc
7070-5. Modify getLanguage function in src/utils.ts
7171-6. Run lint and format checks
7272-7. Test the changes manually or with existing tests
7373-7474-## Risk Assessment
7575-- **Low Risk**: Direct replacement with similar functionality
7676-- **Compatibility**: Both libraries use ISO 639-3 codes
7777-- **Performance**: franc is generally faster and lighter
7878-- **Maintenance**: franc is more actively maintained
-1
PRD.md
···11-Replace `lande` with `franc` for language detection in the exported async function `getLanguage`, which is found in the file `src/utils.ts`.
+1-2
src/agent.ts
···11-import { AtpAgent } from "@atproto/api";
21import { setGlobalDispatcher, Agent as Agent } from "undici";
33-42setGlobalDispatcher(new Agent({ connect: { timeout: 20_000 } }));
53import { BSKY_HANDLE, BSKY_PASSWORD, OZONE_PDS } from "./config.js";
44+import { AtpAgent } from "@atproto/api";
6576export const agent = new AtpAgent({
87 service: `https://${OZONE_PDS}`,
+10-20
src/checkHandles.ts
···1111 handle: string,
1212 time: number,
1313) => {
1414- // Get a list of labels
1515- const labels: string[] = Array.from(
1616- HANDLE_CHECKS,
1717- (handleCheck) => handleCheck.label,
1818- );
1919-2020- // iterate through the labels
2121- labels.forEach((label) => {
2222- const checkList = HANDLE_CHECKS.find(
2323- (handleCheck) => handleCheck.label === label,
2424- );
2525-2626- if (checkList?.ignoredDIDs) {
1414+ // iterate through the checks
1515+ HANDLE_CHECKS.forEach((checkList) => {
1616+ if (checkList.ignoredDIDs) {
2717 if (checkList.ignoredDIDs.includes(did)) {
2818 logger.info(`Whitelisted DID: ${did}`);
2919 return;
3020 }
3121 }
32223333- if (checkList!.check.test(handle)) {
2323+ if (checkList.check.test(handle)) {
3424 // False-positive checks
3535- if (checkList?.whitelist) {
3636- if (checkList?.whitelist.test(handle)) {
2525+ if (checkList.whitelist) {
2626+ if (checkList.whitelist.test(handle)) {
3727 logger.info(`Whitelisted phrase found for: ${handle}`);
3828 return;
3929 }
4030 }
41314242- if (checkList?.toLabel === true) {
3232+ if (checkList.toLabel === true) {
4333 logger.info(`[CHECKHANDLE]: Labeling ${did} for ${checkList.label}`);
4434 {
4535 createAccountLabel(
4636 did,
4747- checkList.label,
3737+ `${checkList.label}`,
4838 `${time}: ${checkList.comment} - ${handle}`,
4939 );
5040 }
5141 }
52425353- if (checkList?.reportAcct === true) {
4343+ if (checkList.reportAcct === true) {
5444 logger.info(`[CHECKHANDLE]: Reporting ${did} for ${checkList.label}`);
5545 createAccountReport(did, `${time}: ${checkList.comment} - ${handle}`);
5646 }
57475858- if (checkList?.commentAcct === true) {
4848+ if (checkList.commentAcct === true) {
5949 logger.info(
6050 `[CHECKHANDLE]: Commenting on ${did} for ${checkList.label}`,
6151 );
+26-35
src/checkPosts.ts
···11import { LINK_SHORTENER, POST_CHECKS } from "./constants.js";
22+import { Post } from "./types.js";
33+import logger from "./logger.js";
24import { countStarterPacks } from "./count.js";
33-import logger from "./logger.js";
45import {
56 createPostLabel,
67 createAccountReport,
78 createAccountComment,
89 createPostReport,
910} from "./moderation.js";
1010-import type { Post } from "./types.js";
1111import { getFinalUrl, getLanguage } from "./utils.js";
12121313export const checkPosts = async (post: Post[]) => {
1414- // Get a list of labels
1515- const labels: string[] = Array.from(
1616- POST_CHECKS,
1717- (postCheck) => postCheck.label,
1818- );
1919-2014 const urlRegex = /https?:\/\/[^\s]+/g;
21152216 // Check for link shorteners
···4539 // Get the post's language
4640 const lang = await getLanguage(post[0].text);
47414848- // iterate through the labels
4949- labels.forEach((label) => {
5050- const checkPost = POST_CHECKS.find(
5151- (postCheck) => postCheck.label === label,
5252- );
5353-5454- if (checkPost?.language || checkPost?.language !== undefined) {
5555- if (!checkPost?.language.includes(lang)) {
4242+ // iterate through the checks
4343+ POST_CHECKS.forEach((checkPost) => {
4444+ if (checkPost.language) {
4545+ if (!checkPost.language.includes(lang)) {
5646 return;
5747 }
5848 }
59496060- if (checkPost?.ignoredDIDs) {
6161- if (checkPost?.ignoredDIDs.includes(post[0].did)) {
5050+ if (checkPost.ignoredDIDs) {
5151+ if (checkPost.ignoredDIDs.includes(post[0].did)) {
6252 logger.info(`[CHECKPOSTS]: Whitelisted DID: ${post[0].did}`);
6353 return;
6454 }
6555 }
66566767- if (checkPost!.check.test(post[0].text)) {
5757+ if (checkPost.check.test(post[0].text)) {
6858 // Check if post is whitelisted
6969- if (checkPost?.whitelist) {
7070- if (checkPost?.whitelist.test(post[0].text)) {
7171- logger.info("[CHECKPOSTS]: Whitelisted phrase found\"");
5959+ if (checkPost.whitelist) {
6060+ if (checkPost.whitelist.test(post[0].text)) {
6161+ logger.info(`[CHECKPOSTS]: Whitelisted phrase found"`);
7262 return;
7363 }
7464 }
75657666 countStarterPacks(post[0].did, post[0].time);
77677878- if (checkPost!.toLabel) {
6868+ if (checkPost.toLabel === true) {
7969 logger.info(
8080- `[CHECKPOSTS]: Labeling ${post[0].atURI} for ${checkPost!.label}`,
7070+ `[CHECKPOSTS]: Labeling ${post[0].atURI} for ${checkPost.label}`,
8171 );
8272 createPostLabel(
8373 post[0].atURI,
8474 post[0].cid,
8585- checkPost!.label,
8686- `${post[0].time}: ${checkPost!.comment} at ${post[0].atURI} with text "${post[0].text}"`,
7575+ `${checkPost.label}`,
7676+ `${post[0].time}: ${checkPost.comment} at ${post[0].atURI} with text "${post[0].text}"`,
7777+ checkPost.duration,
8778 );
8879 }
89809090- if (checkPost!.reportPost === true) {
8181+ if (checkPost.reportPost === true) {
9182 logger.info(
9292- `[CHECKPOSTS]: Reporting ${post[0].atURI} for ${checkPost!.label}`,
8383+ `[CHECKPOSTS]: Reporting ${post[0].atURI} for ${checkPost.label}`,
9384 );
9485 logger.info(`Reporting: ${post[0].atURI}`);
9586 createPostReport(
9687 post[0].atURI,
9788 post[0].cid,
9898- `${post[0].time}: ${checkPost!.comment} at ${post[0].atURI} with text "${post[0].text}"`,
8989+ `${post[0].time}: ${checkPost.comment} at ${post[0].atURI} with text "${post[0].text}"`,
9990 );
10091 }
10192102102- if (checkPost!.reportAcct) {
9393+ if (checkPost.reportAcct === true) {
10394 logger.info(
104104- `[CHECKPOSTS]: Reporting on ${post[0].did} for ${checkPost!.label} in ${post[0].atURI}`,
9595+ `[CHECKPOSTS]: Reporting on ${post[0].did} for ${checkPost.label} in ${post[0].atURI}`,
10596 );
10697 createAccountReport(
10798 post[0].did,
108108- `${post[0].time}: ${checkPost?.comment} at ${post[0].atURI} with text "${post[0].text}"`,
9999+ `${post[0].time}: ${checkPost.comment} at ${post[0].atURI} with text "${post[0].text}"`,
109100 );
110101 }
111102112112- if (checkPost!.commentAcct) {
103103+ if (checkPost.commentAcct === true) {
113104 logger.info(
114114- `[CHECKPOSTS]: Commenting on ${post[0].did} for ${checkPost!.label} in ${post[0].atURI}`,
105105+ `[CHECKPOSTS]: Commenting on ${post[0].did} for ${checkPost.label} in ${post[0].atURI}`,
115106 );
116107 createAccountComment(
117108 post[0].did,
118118- `${post[0].time}: ${checkPost?.comment} at ${post[0].atURI} with text "${post[0].text}"`,
109109+ `${post[0].time}: ${checkPost.comment} at ${post[0].atURI} with text "${post[0].text}"`,
119110 );
120111 }
121112 }
+22-41
src/checkProfiles.ts
···1616) => {
1717 const lang = await getLanguage(description);
18181919- const labels: string[] = Array.from(
2020- PROFILE_CHECKS,
2121- (profileCheck) => profileCheck.label,
2222- );
2323-2424- // iterate through the labels
2525- labels.forEach((label) => {
2626- const checkProfiles = PROFILE_CHECKS.find(
2727- (profileCheck) => profileCheck.label === label,
2828- );
2929-3030- if (checkProfiles?.language || checkProfiles?.language !== undefined) {
3131- if (!checkProfiles?.language.includes(lang)) {
1919+ // iterate through the checks
2020+ PROFILE_CHECKS.forEach((checkProfiles) => {
2121+ if (checkProfiles.language) {
2222+ if (!checkProfiles.language.includes(lang)) {
3223 return;
3324 }
3425 }
35263627 // Check if DID is whitelisted
3737- if (checkProfiles?.ignoredDIDs) {
2828+ if (checkProfiles.ignoredDIDs) {
3829 if (checkProfiles.ignoredDIDs.includes(did)) {
3930 logger.info(`[CHECKDESCRIPTION]: Whitelisted DID: ${did}`);
4031 return;
···4233 }
43344435 if (description) {
4545- if (checkProfiles?.description === true) {
3636+ if (checkProfiles.description === true) {
4637 if (checkProfiles.check.test(description)) {
4738 // Check if description is whitelisted
4839 if (checkProfiles.whitelist) {
4940 if (checkProfiles.whitelist.test(description)) {
5050- logger.info("[CHECKDESCRIPTION]: Whitelisted phrase found.");
4141+ logger.info(`[CHECKDESCRIPTION]: Whitelisted phrase found.`);
5142 return;
5243 }
5344 }
54455555- if (checkProfiles.toLabel) {
4646+ if (checkProfiles.toLabel === true) {
5647 createAccountLabel(
5748 did,
5858- checkProfiles.label,
4949+ `${checkProfiles.label}`,
5950 `${time}: ${checkProfiles.comment} - ${displayName} - ${description}`,
6051 );
6152 logger.info(
···6354 );
6455 }
65566666- if (checkProfiles.reportAcct) {
5757+ if (checkProfiles.reportAcct === true) {
6758 createAccountReport(
6859 did,
6960 `${time}: ${checkProfiles.comment} - ${displayName} - ${description}`,
···7364 );
7465 }
75667676- if (checkProfiles.commentAcct) {
6767+ if (checkProfiles.commentAcct === true) {
7768 createAccountComment(
7869 did,
7970 `${time}: ${checkProfiles.comment} - ${displayName} - ${description}`,
···9687) => {
9788 const lang = await getLanguage(description);
98899999- // Get a list of labels
100100- const labels: string[] = Array.from(
101101- PROFILE_CHECKS,
102102- (profileCheck) => profileCheck.label,
103103- );
104104-105105- // iterate through the labels
106106- labels.forEach((label) => {
107107- const checkProfiles = PROFILE_CHECKS.find(
108108- (profileCheck) => profileCheck.label === label,
109109- );
110110-111111- if (checkProfiles?.language || checkProfiles?.language !== undefined) {
112112- if (!checkProfiles?.language.includes(lang)) {
9090+ // iterate through the checks
9191+ PROFILE_CHECKS.forEach((checkProfiles) => {
9292+ if (checkProfiles.language) {
9393+ if (!checkProfiles.language.includes(lang)) {
11394 return;
11495 }
11596 }
1169711798 // Check if DID is whitelisted
118118- if (checkProfiles?.ignoredDIDs) {
9999+ if (checkProfiles.ignoredDIDs) {
119100 if (checkProfiles.ignoredDIDs.includes(did)) {
120101 logger.info(`[CHECKDISPLAYNAME]: Whitelisted DID: ${did}`);
121102 return;
···123104 }
124105125106 if (displayName) {
126126- if (checkProfiles?.displayName === true) {
107107+ if (checkProfiles.displayName === true) {
127108 if (checkProfiles.check.test(displayName)) {
128109 // Check if displayName is whitelisted
129110 if (checkProfiles.whitelist) {
130111 if (checkProfiles.whitelist.test(displayName)) {
131131- logger.info("[CHECKDISPLAYNAME]: Whitelisted phrase found.");
112112+ logger.info(`[CHECKDISPLAYNAME]: Whitelisted phrase found.`);
132113 return;
133114 }
134115 }
135116136136- if (checkProfiles.toLabel) {
117117+ if (checkProfiles.toLabel === true) {
137118 createAccountLabel(
138119 did,
139139- checkProfiles.label,
120120+ `${checkProfiles.label}`,
140121 `${time}: ${checkProfiles.comment} - ${displayName} - ${description}`,
141122 );
142123 logger.info(
···144125 );
145126 }
146127147147- if (checkProfiles.reportAcct) {
128128+ if (checkProfiles.reportAcct === true) {
148129 createAccountReport(
149130 did,
150131 `${time}: ${checkProfiles.comment} - ${displayName} - ${description}`,
···154135 );
155136 }
156137157157- if (checkProfiles.commentAcct) {
138138+ if (checkProfiles.commentAcct === true) {
158139 createAccountComment(
159140 did,
160141 `${time}: ${checkProfiles.comment} - ${displayName} - ${description}`,
+8-8
src/checkStarterPack.ts
···2626 // Check if DID is whitelisted
2727 if (checkProfiles?.ignoredDIDs) {
2828 if (checkProfiles.ignoredDIDs.includes(did)) {
2929- logger.info(`Whitelisted DID: ${did}`); return;
2929+ return logger.info(`Whitelisted DID: ${did}`);
3030 }
3131 }
3232···3636 logger.info(`Account joined via starter pack at: ${atURI}`);
3737 createAccountLabel(
3838 did,
3939- checkProfiles.label,
4040- `${time}: ${checkProfiles.comment} - Account joined via starter pack at: ${atURI}`,
3939+ `${checkProfiles!.label}`,
4040+ `${time}: ${checkProfiles!.comment} - Account joined via starter pack at: ${atURI}`,
4141 );
4242 }
4343 }
···6565 createPostLabel(
6666 atURI,
6767 cid,
6868- checkList.label,
6969- `${time}: Starter pack created by known vector for ${checkList.label} at: ${atURI}"`,
6868+ `${checkList!.label}`,
6969+ `${time}: Starter pack created by known vector for ${checkList!.label} at: ${atURI}"`,
7070 );
7171 createAccountReport(
7272 did,
7373- `${time}: Starter pack created by known vector for ${checkList.label} at: ${atURI}"`,
7373+ `${time}: Starter pack created by known vector for ${checkList!.label} at: ${atURI}"`,
7474 );
7575 }
7676···8080 createPostLabel(
8181 atURI,
8282 cid,
8383- checkList!.label,
8383+ `${checkList!.label}`,
8484 `${time}: ${checkList!.comment} at ${atURI} with text "${description}"`,
8585 );
8686 createAccountReport(
···9696 createPostLabel(
9797 atURI,
9898 cid,
9999- checkList!.label,
9999+ `${checkList!.label}`,
100100 `${time}: ${checkList!.comment} at ${atURI} with pack name "${packName}"`,
101101 );
102102 createAccountReport(
···11import { isLoggedIn, agent } from "./agent.js";
22-import { limit } from "./limits.js";
32import logger from "./logger.js";
33+import { limit } from "./limits.js";
44import { createAccountLabel } from "./moderation.js";
5566export const countStarterPacks = async (did: string, time: number) => {
+4-8
src/developing_checks.md
···11# How to build checks for skywatch-automod
2233## Introduction
54Constants.ts defines three types of checks: `HANDLE_CHECKS`, `POST_CHECKS`, and `PROFILE_CHECKS`.
6576For each check, users need to define a set of regular expressions that will be used to match against the content of the post, handle, or profile. A maximal example of a check is as follows:
···98```typescript
109export const HANDLE_CHECKS: Checks[] = [
1110 {
1212- language: "[eng]", // Language of the check. If the check language does not match the content language, the check will be skipped. Assign null or remove field to apply to all languages.
1311 label: "example",
1412 comment: "Example found in handle",
1513 description: true, // Optional, only used in profile checks
1614 displayName: true, // Optional, only used in profile checks
1717- reportAcct: false, // if true, the check will only report the content against the account, not label.
1818- reportPost: false, // if true, the check will only report the content against the post, not label. Only used in post checks.
1919- commentOnly: false, // if true, will generate an account level comment from flagged posts, rather than a report. Intended for use when reportAcct is false, and on posts only where the flag may generate a high volume of reports.
2020- toLabel: true, // Should the handle in question be labeled if check evaluates to true.
1515+ reportOnly: false, // if true, the check will only report the content against the account, not label.
1616+ commentOnly: false, // Poorly named: if true, will generate an account level comment from flagged posts, rather than a report. Intended for use when reportOnly is false, and on posts only where the flag may generate a high volume of reports.
2117 check: new RegExp("example", "i"), // Regular expression to match against the content
2218 whitelist: new RegExp("example.com", "i"), // Optional, regular expression to whitelist content
2323- ignoredDIDs: ["did:plc:example"], // Optional, array of DIDs to ignore if they match the check. Useful for folks who reclaim words or accounts which may be false positives.
2424- },
1919+ ignoredDIDs: ["did:plc:example"] // Optional, array of DIDs to ignore if they match the check. Useful for folks who reclaim words.
2020+ }
2521];
2622```
2723
···11-import { describe, it, expect } from "vitest";
22-import { getLanguage } from "../src/utils.js";
33-44-describe("Critical moderation language detection", () => {
55- describe("English vs French 'retard' disambiguation", () => {
66- it("should detect French when 'retard' is used in French context (meaning 'delay')", async () => {
77- const frenchContexts = [
88- "Le train a du retard aujourd'hui",
99- "Il y a un retard de livraison",
1010- "Désolé pour le retard",
1111- "Mon vol a trois heures de retard",
1212- "Le retard est dû à la météo",
1313- "J'ai un retard de 15 minutes",
1414- "Le projet prend du retard",
1515- "Nous avons accumulé du retard",
1616- "Sans retard s'il vous plaît",
1717- "Le retard n'est pas acceptable",
1818- ];
1919-2020- for (const text of frenchContexts) {
2121- const result = await getLanguage(text);
2222- // Should detect as French (fra) or potentially other Romance languages, but NOT English
2323- expect(result).not.toBe("eng");
2424- // Most likely to be detected as French
2525- expect(["fra", "cat", "spa", "ita", "por", "ron"].includes(result)).toBe(true);
2626- }
2727- });
2828-2929- it("should detect English when 'retard' is used in English offensive context", async () => {
3030- const englishContexts = [
3131- "Don't be such a retard about it",
3232- "That's completely retarded logic",
3333- "Stop acting like a retard",
3434- "What a retard move that was",
3535- "Only a retard would think that",
3636- ];
3737-3838- for (const text of englishContexts) {
3939- const result = await getLanguage(text);
4040- // Should detect as English or closely related Germanic languages
4141- expect(["eng", "sco", "nld", "afr", "deu"].includes(result)).toBe(true);
4242- }
4343- });
4444-4545- it("should handle mixed signals but lean towards context language", async () => {
4646- // French sentence structure with 'retard' should be French
4747- const frenchStructure = "Le retard du train";
4848- const result1 = await getLanguage(frenchStructure);
4949- expect(result1).not.toBe("eng");
5050-5151- // English sentence structure with 'retard' should be English
5252- const englishStructure = "The retard in the system";
5353- const result2 = await getLanguage(englishStructure);
5454- // May detect as English or Dutch/Germanic due to structure
5555- expect(["eng", "nld", "afr", "deu", "sco"].includes(result2)).toBe(true);
5656- });
5757-5858- it("should detect French for common French phrases with 'retard'", async () => {
5959- const commonFrenchPhrases = [
6060- "en retard",
6161- "du retard",
6262- "avec retard",
6363- "sans retard",
6464- "mon retard",
6565- "ton retard",
6666- "son retard",
6767- "notre retard",
6868- "votre retard",
6969- "leur retard",
7070- ];
7171-7272- for (const phrase of commonFrenchPhrases) {
7373- const result = await getLanguage(phrase);
7474- // Very short phrases might be harder to detect, but should not be English
7575- expect(result).not.toBe("eng");
7676- }
7777- });
7878-7979- it("should provide context for moderation decisions", async () => {
8080- // Test case that matters for moderation
8181- const testCases = [
8282- {
8383- text: "Je suis en retard pour le meeting",
8484- expectedLang: ["fra", "cat", "spa", "ita"],
8585- isOffensive: false,
8686- context: "French: I am late for the meeting"
8787- },
8888- {
8989- text: "You're being a retard about this",
9090- expectedLang: ["eng", "sco", "nld"],
9191- isOffensive: true,
9292- context: "English: Offensive slur usage"
9393- },
9494- {
9595- text: "Le retard mental est un terme médical désuet",
9696- expectedLang: ["fra", "cat", "spa"],
9797- isOffensive: false,
9898- context: "French: Medical terminology (outdated)"
9999- },
100100- {
101101- text: "That's so retarded dude",
102102- expectedLang: ["eng", "sco"],
103103- isOffensive: true,
104104- context: "English: Casual offensive usage"
105105- }
106106- ];
107107-108108- for (const testCase of testCases) {
109109- const result = await getLanguage(testCase.text);
110110-111111- // Check if detected language is in expected set
112112- const isExpectedLang = testCase.expectedLang.some(lang => result === lang);
113113-114114- if (!isExpectedLang) {
115115- console.log(`Warning: "${testCase.text}" detected as ${result}, expected one of ${testCase.expectedLang.join(', ')}`);
116116- }
117117-118118- // The key insight: if detected as French/Romance language, likely NOT offensive
119119- // if detected as English/Germanic, needs moderation review
120120- const needsModeration = ["eng", "sco", "nld", "afr", "deu"].includes(result);
121121-122122- // This aligns with whether the content is actually offensive
123123- if (testCase.isOffensive) {
124124- expect(needsModeration).toBe(true);
125125- }
126126- }
127127- });
128128- });
129129-130130- describe("Other ambiguous terms across languages", () => {
131131- it("should detect language for other potentially ambiguous terms", async () => {
132132- const ambiguousCases = [
133133- { text: "Elle a un chat noir", lang: "fra", meaning: "She has a black cat (French)" },
134134- { text: "Let's chat about it", lang: "eng", meaning: "Let's talk (English)" },
135135- { text: "Das Gift ist gefährlich", lang: "deu", meaning: "The poison is dangerous (German)" },
136136- { text: "I got a gift for you", lang: "eng", meaning: "I got a present (English)" },
137137- { text: "El éxito fue grande", lang: "spa", meaning: "The success was great (Spanish)" },
138138- { text: "Take the exit here", lang: "eng", meaning: "Take the exit (English)" },
139139- ];
140140-141141- for (const testCase of ambiguousCases) {
142142- const result = await getLanguage(testCase.text);
143143- // Log for debugging but don't fail - language detection is probabilistic
144144- if (result !== testCase.lang) {
145145- console.log(`Note: "${testCase.text}" detected as ${result}, expected ${testCase.lang}`);
146146- }
147147- }
148148- });
149149- });
150150-});
-190
tests/utils.test.ts
···11-import { describe, it, expect, beforeEach, vi } from "vitest";
22-import { getLanguage } from "../src/utils.js";
33-44-// Mock the logger to avoid console output during tests
55-vi.mock("../src/logger.js", () => ({
66- default: {
77- warn: vi.fn(),
88- },
99-}));
1010-1111-describe("getLanguage", () => {
1212- beforeEach(() => {
1313- vi.clearAllMocks();
1414- });
1515-1616- describe("input validation", () => {
1717- it("should return 'eng' for null input", async () => {
1818- const result = await getLanguage(null as any);
1919- expect(result).toBe("eng");
2020- });
2121-2222- it("should return 'eng' for undefined input", async () => {
2323- const result = await getLanguage(undefined as any);
2424- expect(result).toBe("eng");
2525- });
2626-2727- it("should return 'eng' for number input", async () => {
2828- const result = await getLanguage(123 as any);
2929- expect(result).toBe("eng");
3030- });
3131-3232- it("should return 'eng' for empty string", async () => {
3333- const result = await getLanguage("");
3434- expect(result).toBe("eng");
3535- });
3636-3737- it("should return 'eng' for whitespace-only string", async () => {
3838- const result = await getLanguage(" \n\t ");
3939- expect(result).toBe("eng");
4040- });
4141- });
4242-4343- describe("language detection", () => {
4444- it("should detect English text", async () => {
4545- const englishText = "This is a sample English text that should be detected correctly.";
4646- const result = await getLanguage(englishText);
4747- expect(result).toBe("eng");
4848- });
4949-5050- it("should detect Spanish text", async () => {
5151- const spanishText = "Este es un texto de ejemplo en español que debe ser detectado correctamente.";
5252- const result = await getLanguage(spanishText);
5353- // franc may detect Galician (glg) for some Spanish text - both are valid Romance languages
5454- expect(["spa", "glg", "cat"].includes(result)).toBe(true);
5555- });
5656-5757- it("should detect French text", async () => {
5858- const frenchText = "Ceci est un exemple de texte en français qui devrait être détecté correctement.";
5959- const result = await getLanguage(frenchText);
6060- expect(result).toBe("fra");
6161- });
6262-6363- it("should detect German text", async () => {
6464- const germanText = "Dies ist ein deutscher Beispieltext, der korrekt erkannt werden sollte.";
6565- const result = await getLanguage(germanText);
6666- expect(result).toBe("deu");
6767- });
6868-6969- it("should detect Portuguese text", async () => {
7070- const portugueseText = "Este é um texto de exemplo em português que deve ser detectado corretamente.";
7171- const result = await getLanguage(portugueseText);
7272- expect(result).toBe("por");
7373- });
7474-7575- it("should detect Italian text", async () => {
7676- const italianText = "Questo è un testo di esempio in italiano che dovrebbe essere rilevato correttamente.";
7777- const result = await getLanguage(italianText);
7878- expect(result).toBe("ita");
7979- });
8080-8181- it("should detect Russian text", async () => {
8282- const russianText = "Это пример текста на русском языке, который должен быть правильно определен.";
8383- const result = await getLanguage(russianText);
8484- expect(result).toBe("rus");
8585- });
8686-8787- it("should detect Japanese text", async () => {
8888- const japaneseText = "これは正しく検出されるべき日本語のサンプルテキストです。";
8989- const result = await getLanguage(japaneseText);
9090- expect(result).toBe("jpn");
9191- });
9292-9393- it("should detect Chinese text", async () => {
9494- const chineseText = "这是一个应该被正确检测的中文示例文本。";
9595- const result = await getLanguage(chineseText);
9696- expect(result).toBe("cmn");
9797- });
9898-9999- it("should detect Arabic text", async () => {
100100- const arabicText = "هذا نص عينة باللغة العربية يجب اكتشافه بشكل صحيح.";
101101- const result = await getLanguage(arabicText);
102102- expect(result).toBe("arb");
103103- });
104104- });
105105-106106- describe("edge cases", () => {
107107- it("should return 'eng' for very short ambiguous text", async () => {
108108- const result = await getLanguage("hi");
109109- // Very short text might be undetermined
110110- expect(["eng", "hin", "und"].includes(result)).toBe(true);
111111- // If undetermined, should default to 'eng'
112112- if (result === "und") {
113113- expect(result).toBe("eng");
114114- }
115115- });
116116-117117- it("should handle mixed language text", async () => {
118118- const mixedText = "Hello world! Bonjour le monde! Hola mundo!";
119119- const result = await getLanguage(mixedText);
120120- // Should detect one of the languages or default to 'eng'
121121- expect(typeof result).toBe("string");
122122- expect(result.length).toBe(3);
123123- });
124124-125125- it("should handle gibberish text", async () => {
126126- const gibberish = "asdfghjkl qwerty zxcvbnm poiuytrewq";
127127- const result = await getLanguage(gibberish);
128128- // Franc may detect gibberish as various languages, not necessarily 'und'
129129- // Just ensure it returns a valid 3-letter language code
130130- expect(result).toMatch(/^[a-z]{3}$/);
131131- });
132132-133133- it("should handle text with emojis", async () => {
134134- const textWithEmojis = "Hello world! 👋 How are you? 😊";
135135- const result = await getLanguage(textWithEmojis);
136136- // Text with emojis should still be detected, though specific language may vary
137137- // Common English-like results include 'eng', 'fuf', 'sco'
138138- expect(result).toMatch(/^[a-z]{3}$/);
139139- });
140140-141141- it("should handle text with special characters", async () => {
142142- const textWithSpecialChars = "Hello @world! #testing $100 & more...";
143143- const result = await getLanguage(textWithSpecialChars);
144144- // Short text with special chars may be detected as various languages
145145- // Common results: 'eng', 'nld' (Dutch), 'afr' (Afrikaans)
146146- expect(["eng", "nld", "afr", "sco"].includes(result) || result.match(/^[a-z]{3}$/)).toBe(true);
147147- });
148148-149149- it("should handle text with URLs", async () => {
150150- const textWithUrls = "Check out this website: https://example.com for more information.";
151151- const result = await getLanguage(textWithUrls);
152152- expect(result).toBe("eng");
153153- });
154154-155155- it("should handle text with numbers", async () => {
156156- const textWithNumbers = "The year 2024 has 365 days and 12 months.";
157157- const result = await getLanguage(textWithNumbers);
158158- // May be detected as English, Scots, or other Germanic languages
159159- expect(["eng", "sco", "nld"].includes(result) || result.match(/^[a-z]{3}$/)).toBe(true);
160160- });
161161- });
162162-163163- describe("franc-specific behavior", () => {
164164- it("should return 'eng' when franc returns 'und'", async () => {
165165- // This tests the specific fallback logic for franc's "undetermined" response
166166- // Using a very short or ambiguous text that franc can't determine
167167- const ambiguousText = "xyz";
168168- const result = await getLanguage(ambiguousText);
169169- // Should either detect a language or fallback to 'eng' if 'und'
170170- expect(typeof result).toBe("string");
171171- expect(result.length).toBe(3);
172172- });
173173-174174- it("should always return a 3-letter ISO 639-3 language code", async () => {
175175- const texts = [
176176- "Hello world",
177177- "Bonjour le monde",
178178- "Hola mundo",
179179- "مرحبا بالعالم",
180180- "你好世界",
181181- "こんにちは世界",
182182- ];
183183-184184- for (const text of texts) {
185185- const result = await getLanguage(text);
186186- expect(result).toMatch(/^[a-z]{3}$/);
187187- }
188188- });
189189- });
190190-});