Add duplicate detection with fuzzy matching and fix artist metadata extraction
This commit is contained in:
97
lib/fuzzyMatch.ts
Normal file
97
lib/fuzzyMatch.ts
Normal file
@@ -0,0 +1,97 @@
|
||||
/**
 * Fuzzy string matching utility for duplicate detection.
 * Uses Levenshtein distance to compare strings while tolerating formatting variations.
 */
|
||||
|
||||
/**
 * Normalize a string for comparison.
 *
 * Steps, in order:
 * - Decomposes the text with Unicode NFKD so accented letters split into a
 *   base letter plus combining marks, and compatibility forms (full-width
 *   letters, ligatures) fold to their plain equivalents.
 * - Removes combining diacritics in U+0300–U+036F, so "é" compares equal to "e".
 * - Converts to lowercase.
 * - Removes everything that is not a letter, mark, digit or whitespace in any
 *   script. Punctuation and symbols are dropped, but non-Latin text survives
 *   instead of collapsing to an empty string.
 * - Collapses whitespace runs to a single space and trims both ends.
 *
 * For pure-ASCII input the result is the same as stripping `[^a-z0-9\s]`.
 *
 * @param str Raw string to normalize
 * @returns The normalized form, possibly empty
 */
function normalizeString(str: string): string {
  return str
    .normalize('NFKD')
    // Only Latin-style diacritics are stripped; marks outside this range
    // (e.g. kana voicing marks, Indic vowel signs) carry meaning and are kept.
    .replace(/[\u0300-\u036f]/g, '')
    .toLowerCase()
    .replace(/[^\p{L}\p{M}\p{N}\s]/gu, '') // Remove punctuation and symbols
    .replace(/\s+/g, ' ') // Normalize whitespace
    .trim();
}
|
||||
|
||||
/**
 * Calculate the Levenshtein distance between two strings.
 *
 * The distance is the minimum number of single-character insertions,
 * deletions and substitutions needed to change one string into the other.
 * Characters are compared as UTF-16 code units, so the result is measured in
 * the same units as `String.prototype.length`.
 *
 * Only two rows of the dynamic-programming table are kept at a time, and the
 * shorter string defines the row width, so extra memory is proportional to
 * the shorter input rather than to the product of both lengths.
 *
 * @param a First string
 * @param b Second string
 * @returns The edit distance: 0 for equal strings, and the other string's
 *          length when one of them is empty
 */
function levenshteinDistance(a: string, b: string): number {
  if (a === b) return 0;
  if (a.length === 0) return b.length;
  if (b.length === 0) return a.length;

  // The distance is symmetric, so the shorter string can always index the columns.
  const shorter = a.length <= b.length ? a : b;
  const longer = a.length <= b.length ? b : a;
  const width = shorter.length;

  // previous[j] is the distance between the first j units of `shorter` and the
  // prefix of `longer` processed so far; it starts as the row for the empty prefix.
  let previous: number[] = Array.from({ length: width + 1 }, (_, j) => j);
  let current: number[] = new Array<number>(width + 1).fill(0);

  for (let i = 1; i <= longer.length; i++) {
    // Turning the first i units of `longer` into an empty string costs i deletions.
    current[0] = i;
    const longerUnit = longer.charCodeAt(i - 1);

    for (let j = 1; j <= width; j++) {
      const substitutionCost = shorter.charCodeAt(j - 1) === longerUnit ? 0 : 1;
      current[j] = Math.min(
        previous[j - 1] + substitutionCost, // substitution (or match)
        current[j - 1] + 1, // insertion
        previous[j] + 1 // deletion
      );
    }

    // Reuse the two buffers instead of allocating a new row per iteration.
    [previous, current] = [current, previous];
  }

  return previous[width];
}
|
||||
|
||||
/**
 * Check if two strings are similar based on Levenshtein distance.
 *
 * Both inputs are normalized with `normalizeString` and then compared:
 * - If either raw input is empty (or otherwise falsy), the result is `false`.
 * - If both inputs normalize to the empty string (input made up solely of
 *   characters the normalizer discards), there is nothing left to measure, so
 *   they are treated as similar only when the trimmed raw strings are equal
 *   ignoring case. This stops unrelated inputs from matching just because
 *   normalization erased them.
 * - If the normalized strings are identical, the result is `true`.
 * - Otherwise similarity is `1 - distance / maxLen`, where `maxLen` is the
 *   longer normalized length, and the result is `similarity >= threshold`.
 *
 * @param str1 First string to compare
 * @param str2 Second string to compare
 * @param threshold Similarity threshold (0-1), default 0.85
 * @returns true if strings are similar enough
 */
export function isSimilar(str1: string, str2: string, threshold = 0.85): boolean {
  if (!str1 || !str2) return false;

  const norm1 = normalizeString(str1);
  const norm2 = normalizeString(str2);

  // Nothing survived normalization on either side: compare the raw text
  // instead of declaring two empty strings a match.
  if (norm1.length === 0 && norm2.length === 0) {
    return str1.trim().toLowerCase() === str2.trim().toLowerCase();
  }

  // Exact match after normalization
  if (norm1 === norm2) return true;

  const distance = levenshteinDistance(norm1, norm2);

  // At least one normalized string is non-empty here, so maxLen > 0 and the
  // division below is safe.
  const maxLen = Math.max(norm1.length, norm2.length);
  const similarity = 1 - distance / maxLen;

  return similarity >= threshold;
}
|
||||
|
||||
/**
 * Decide whether two songs, each given as an artist and a title, are duplicates.
 *
 * A pair counts as a duplicate only when the artists are similar AND the
 * titles are similar, both judged by `isSimilar` with the same threshold.
 *
 * @param artist1 Artist of the first song
 * @param title1 Title of the first song
 * @param artist2 Artist of the second song
 * @param title2 Title of the second song
 * @param threshold Similarity threshold passed to `isSimilar`, default 0.85
 * @returns true when both the artists and the titles are similar enough
 */
export function isDuplicateSong(
  artist1: string,
  title1: string,
  artist2: string,
  title2: string,
  threshold = 0.85
): boolean {
  // Titles are only compared once the artists have already matched.
  if (!isSimilar(artist1, artist2, threshold)) {
    return false;
  }

  return isSimilar(title1, title2, threshold);
}
|
||||
Reference in New Issue
Block a user