/**
 * Fuzzy string matching utility for duplicate detection.
 *
 * Strings are normalized (case, diacritics, punctuation, whitespace) and then
 * compared with a length-relative Levenshtein distance, so formatting
 * variations such as "Hello, World!" vs "hello world" still match.
 */

/**
 * Normalize a string for comparison.
 *
 * - Decomposes characters (NFKD) and strips Latin-style combining diacritics,
 *   so "é" compares equal to "e"
 * - Converts to lowercase
 * - Removes everything that is not a letter, combining mark, digit or
 *   whitespace (in any script, not only ASCII)
 * - Collapses runs of whitespace into a single space and trims the ends
 *
 * If stripping leaves nothing (the input consisted only of symbols, e.g.
 * "!!!"), the case/whitespace-folded original is returned instead so that
 * distinct symbol-only strings do not all collapse to the same empty key.
 *
 * @param str Raw input string
 * @returns The normalized comparison key
 */
function normalizeString(str: string): string {
  const collapseWhitespace = (s: string): string => s.replace(/\s+/g, ' ').trim();

  const stripped = collapseWhitespace(
    str
      .normalize('NFKD')
      // Only U+0300–U+036F is removed: these are the diacritics used with
      // Latin/Greek/Cyrillic. Marks that are integral to other scripts
      // (e.g. Japanese dakuten, Devanagari vowel signs) are preserved.
      .replace(/[\u0300-\u036f]/g, '')
      // Recompose whatever is left so lengths are not inflated by NFKD.
      .normalize('NFC')
      .toLowerCase()
      .replace(/[^\p{L}\p{M}\p{N}\s]/gu, '')
  );

  if (stripped.length > 0) return stripped;

  // Nothing but symbols/whitespace: compare the symbols themselves.
  return collapseWhitespace(str.toLowerCase());
}

/**
 * Calculate the Levenshtein distance between two strings.
 *
 * Uses a two-row rolling buffer instead of a full matrix, since each cell only
 * depends on the current and the previous row. Comparison is done per UTF-16
 * code unit, consistent with `String.prototype.length`.
 *
 * @param a First string
 * @param b Second string
 * @returns The minimum number of single-character insertions, deletions or
 *          substitutions needed to turn one string into the other
 */
function levenshteinDistance(a: string, b: string): number {
  if (a === b) return 0;
  if (a.length === 0) return b.length;
  if (b.length === 0) return a.length;

  // The distance is symmetric, so lay the shorter string along the row to
  // keep the buffers as small as possible.
  const shorter = a.length <= b.length ? a : b;
  const longer = a.length <= b.length ? b : a;
  const width = shorter.length + 1;

  // previous[j] = distance between the first (i - 1) chars of `longer`
  // and the first j chars of `shorter`.
  let previous: number[] = Array.from({ length: width }, (_, j) => j);
  let current: number[] = new Array<number>(width).fill(0);

  for (let i = 1; i <= longer.length; i++) {
    current[0] = i;
    const longerCode = longer.charCodeAt(i - 1);

    for (let j = 1; j <= shorter.length; j++) {
      const substitutionCost = longerCode === shorter.charCodeAt(j - 1) ? 0 : 1;
      current[j] = Math.min(
        previous[j - 1] + substitutionCost, // substitution (or match)
        current[j - 1] + 1, // insertion
        previous[j] + 1 // deletion
      );
    }

    // Reuse the old row as the next scratch buffer.
    const swap = previous;
    previous = current;
    current = swap;
  }

  return previous[shorter.length];
}

/**
 * Check if two strings are similar based on Levenshtein distance.
 *
 * Similarity is `1 - distance / max(length)` computed on the normalized
 * strings. An empty (or otherwise falsy) input is never similar to anything.
 *
 * @param str1 First string to compare
 * @param str2 Second string to compare
 * @param threshold Similarity threshold (0-1), default 0.85
 * @returns true if the strings are similar enough
 */
export function isSimilar(str1: string, str2: string, threshold = 0.85): boolean {
  if (!str1 || !str2) return false;

  const norm1 = normalizeString(str1);
  const norm2 = normalizeString(str2);

  // Exact match after normalization
  if (norm1 === norm2) return true;

  const distance = levenshteinDistance(norm1, norm2);

  // The strings differ, so at least one is non-empty and maxLen > 0:
  // no division-by-zero guard is needed here.
  const maxLen = Math.max(norm1.length, norm2.length);
  const similarity = 1 - distance / maxLen;

  return similarity >= threshold;
}

/**
 * Check if a song (artist + title) is a duplicate of another.
 *
 * Both artist AND title must be similar (per {@link isSimilar}) for a match.
 *
 * @param artist1 Artist of the first song
 * @param title1 Title of the first song
 * @param artist2 Artist of the second song
 * @param title2 Title of the second song
 * @param threshold Similarity threshold (0-1) applied to both fields, default 0.85
 * @returns true if both the artists and the titles are similar enough
 */
export function isDuplicateSong(
  artist1: string,
  title1: string,
  artist2: string,
  title2: string,
  threshold = 0.85
): boolean {
  return isSimilar(artist1, artist2, threshold) && isSimilar(title1, title2, threshold);
}