New similarity module to help with comparing track titles

This commit is contained in:
Philipp Wolfer 2024-01-14 13:12:02 +01:00
parent d9d83a4282
commit bace31471e
No known key found for this signature in database
GPG key ID: 8FDF744D4919943B
4 changed files with 111 additions and 0 deletions

View file

@ -0,0 +1,55 @@
/*
Copyright © 2024 Philipp Wolfer <phw@uploadedlobster.com>
Scotty is free software: you can redistribute it and/or modify it under the
terms of the GNU General Public License as published by the Free Software
Foundation, either version 3 of the License, or (at your option) any later version.
Scotty is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
A PARTICULAR PURPOSE. See the GNU General Public License for more details.
You should have received a copy of the GNU General Public License along with
Scotty. If not, see <https://www.gnu.org/licenses/>.
*/
package similarity
import (
"regexp"
"strings"
"github.com/agnivade/levenshtein"
"go.uploadedlobster.com/scotty/internal/util"
"golang.org/x/text/unicode/norm"
)
// Similarity returns a similarity score for s1 and s2 based on their
// Levenshtein distance: one minus the distance divided by the number of runes
// in the longer string.
//
// Both strings are converted to Unicode normalization form NFKC before they
// are measured and compared. Two empty strings yield 1.0.
func Similarity(s1 string, s2 string) float64 {
	a := norm.NFKC.String(s1)
	b := norm.NFKC.String(s2)

	// Length is measured in runes, not bytes, so multi-byte characters count
	// once.
	longest := util.Max(len([]rune(a)), len([]rune(b)))
	if longest == 0 {
		// Both strings are empty: treat them as fully equal and avoid a
		// division by zero below.
		return 1.0
	}

	edits := levenshtein.ComputeDistance(a, b)
	return 1.0 - float64(edits)/float64(longest)
}
var (
	// reExtraTitleInfo matches a single parenthesized group at the very end of
	// the string, e.g. "(video edit)".
	reExtraTitleInfo = regexp.MustCompile(`\([^)]+\)$`)
	// reMultiSpace matches any run of one or more whitespace characters.
	reMultiSpace = regexp.MustCompile(`\s+`)
)

// NormalizeTitle normalizes a track or release title for comparison.
//
// The title is trimmed, lowercased, stripped of one trailing parenthesized
// suffix such as "(video edit)", and every run of whitespace is collapsed into
// a single space. The result never has leading or trailing whitespace.
func NormalizeTitle(s string) string {
	// Trim first so that the end-anchored suffix pattern can match even when
	// the input has trailing whitespace.
	s = strings.ToLower(strings.TrimSpace(s))
	s = reExtraTitleInfo.ReplaceAllString(s, "")
	s = reMultiSpace.ReplaceAllString(s, " ")
	// Removing the suffix leaves behind the whitespace that separated it from
	// the title, so trim once more.
	return strings.TrimSpace(s)
}

View file

@ -0,0 +1,51 @@
/*
Copyright © 2024 Philipp Wolfer <phw@uploadedlobster.com>
Scotty is free software: you can redistribute it and/or modify it under the
terms of the GNU General Public License as published by the Free Software
Foundation, either version 3 of the License, or (at your option) any later version.
Scotty is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
A PARTICULAR PURPOSE. See the GNU General Public License for more details.
You should have received a copy of the GNU General Public License along with
Scotty. If not, see <https://www.gnu.org/licenses/>.
*/
package similarity_test
import (
"fmt"
"testing"
"github.com/stretchr/testify/assert"
"go.uploadedlobster.com/scotty/internal/similarity"
)
// ExampleSimilarity prints the similarity score of two four-character strings
// that differ in a single character.
func ExampleSimilarity() {
	fmt.Println(similarity.Similarity("bar1", "bär1"))
	// Output: 0.75
}
// TestSimilarity checks Similarity against a table of string pairs, covering
// empty input, fully different strings, a partial match and identical strings.
func TestSimilarity(t *testing.T) {
	cases := []struct {
		s1, s2   string
		expected float64
	}{
		{"", "", 1.0},
		{"foo", "", 0.0},
		{"foo", "bar", 0.0},
		{"foobar", "bar", 0.5},
		{"foo", "foo", 1.0},
	}

	for _, c := range cases {
		assert.Equal(t, c.expected, similarity.Similarity(c.s1, c.s2))
	}
}
// ExampleNormalizeTitle prints the normalized form of a title that has
// surrounding whitespace, mixed case, a tab and a trailing parenthesized
// suffix.
func ExampleNormalizeTitle() {
	fmt.Println(similarity.NormalizeTitle(" Forever \tFailure (video edit) "))
	// Output: forever failure
}
// TestNormalizeTitle checks NormalizeTitle against a table of raw titles and
// their expected normalized forms.
func TestNormalizeTitle(t *testing.T) {
	cases := []struct {
		title    string
		expected string
	}{
		{"Forever Failure", "forever failure"},
		{" \tfoo\t \t", "foo"},
	}

	for _, c := range cases {
		assert.Equal(t, c.expected, similarity.NormalizeTitle(c.title))
	}
}