From bace31471e9402578ad8f9c87715721e114d8cd8 Mon Sep 17 00:00:00 2001 From: Philipp Wolfer Date: Sun, 14 Jan 2024 13:12:02 +0100 Subject: [PATCH] New similarity module to help with comparing track titles --- go.mod | 1 + go.sum | 4 ++ internal/similarity/similarity.go | 55 ++++++++++++++++++++++++++ internal/similarity/similarity_test.go | 51 ++++++++++++++++++++++++ 4 files changed, 111 insertions(+) create mode 100644 internal/similarity/similarity.go create mode 100644 internal/similarity/similarity_test.go diff --git a/go.mod b/go.mod index 4c5487f..bbfa37e 100644 --- a/go.mod +++ b/go.mod @@ -28,6 +28,7 @@ require ( require ( github.com/VividCortex/ewma v1.2.0 // indirect github.com/acarl005/stripansi v0.0.0-20180116102854-5a71ef0e047d // indirect + github.com/agnivade/levenshtein v1.1.1 // indirect github.com/chzyer/readline v0.0.0-20180603132655-2972be24d48e // indirect github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect github.com/dustin/go-humanize v1.0.1 // indirect diff --git a/go.sum b/go.sum index fd1332d..4d08441 100644 --- a/go.sum +++ b/go.sum @@ -44,6 +44,9 @@ github.com/Xuanwo/go-locale v1.1.0 h1:51gUxhxl66oXAjI9uPGb2O0qwPECpriKQb2hl35mQk github.com/Xuanwo/go-locale v1.1.0/go.mod h1:UKrHoZB3FPIk9wIG2/tVSobnHgNnceGSH3Y8DY5cASs= github.com/acarl005/stripansi v0.0.0-20180116102854-5a71ef0e047d h1:licZJFw2RwpHMqeKTCYkitsPqHNxTmd4SNR5r94FGM8= github.com/acarl005/stripansi v0.0.0-20180116102854-5a71ef0e047d/go.mod h1:asat636LX7Bqt5lYEZ27JNDcqxfjdBQuJ/MM4CN/Lzo= +github.com/agnivade/levenshtein v1.1.1 h1:QY8M92nrzkmr798gCo3kmMyqXFzdQVpxLlGPRBij0P8= +github.com/agnivade/levenshtein v1.1.1/go.mod h1:veldBMzWxcCG2ZvUTKD2kJNRdCk5hVbJomOvKkmgYbo= +github.com/arbovm/levenshtein v0.0.0-20160628152529-48b4e1c0c4d0/go.mod h1:t2tdKJDJF9BV14lnkjHmOQgcvEKgtqs5a1N3LNdJhGE= github.com/census-instrumentation/opencensus-proto v0.2.1/go.mod h1:f6KPmirojxKA12rnyqOA5BBL4O983OfeGPqjHWSTneU= github.com/chzyer/logex v1.1.10 
h1:Swpa1K6QvQznwJRcfTfQJmTE72DqScAa40E+fbHEXEE= github.com/chzyer/logex v1.1.10/go.mod h1:+Ywpsq7O8HXn0nuIou7OrIPyXbp3wmkHB+jjWRnGsAI= @@ -64,6 +67,7 @@ github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc h1:U9qPSI2PIWSS1 github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/delucks/go-subsonic v0.0.0-20220915164742-2744002c4be5 h1:RuuxidatioSKGOiBzL1mTY4X22DQD8weEbS3iRLHnAg= github.com/delucks/go-subsonic v0.0.0-20220915164742-2744002c4be5/go.mod h1:vnbEuj6Z20PLcHB4rrLQAOXGMjtULfMGhRVSFPcSdUo= +github.com/dgryski/trifles v0.0.0-20200323201526-dd97f9abfb48/go.mod h1:if7Fbed8SFyPtHLHbg49SI7NAdJiC5WIA09pe59rfAA= github.com/dustin/go-humanize v1.0.1 h1:GzkhY7T5VNhEkwH0PVJgjz+fX1rhBrR7pRT3mDkpeCY= github.com/dustin/go-humanize v1.0.1/go.mod h1:Mu1zIs6XwVuF/gI1OepvI0qD18qycQx+mFykh5fBlto= github.com/envoyproxy/go-control-plane v0.9.0/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4= diff --git a/internal/similarity/similarity.go b/internal/similarity/similarity.go new file mode 100644 index 0000000..8e8536d --- /dev/null +++ b/internal/similarity/similarity.go @@ -0,0 +1,55 @@ +/* +Copyright © 2024 Philipp Wolfer + +Scotty is free software: you can redistribute it and/or modify it under the +terms of the GNU General Public License as published by the Free Software +Foundation, either version 3 of the License, or (at your option) any later version. + +Scotty is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR +A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +Scotty. If not, see . 
+*/
+
+package similarity
+
+import (
+	"regexp"
+	"strings"
+
+	"github.com/agnivade/levenshtein"
+	"go.uploadedlobster.com/scotty/internal/util"
+	"golang.org/x/text/unicode/norm"
+)
+
+// Similarity returns a score for s1 and s2: 1 minus their Levenshtein distance
+// divided by the rune length of the longer string, so equal strings score 1.
+// Both strings are NFKC-normalized first; two empty strings also score 1.
+func Similarity(s1 string, s2 string) float64 {
+	s1 = norm.NFKC.String(s1)
+	s2 = norm.NFKC.String(s2)
+	l1 := len([]rune(s1))
+	l2 := len([]rune(s2))
+	maxLen := util.Max(l1, l2)
+	// Two empty strings are identical; this also avoids dividing by zero below.
+	if maxLen == 0 {
+		return 1.0
+	}
+	dist := levenshtein.ComputeDistance(s1, s2)
+	// Lengths are counted in runes, not bytes, to match the rune-based distance.
+	return 1.0 - (float64(dist) / float64(maxLen))
+}
+
+var reExtraTitleInfo = regexp.MustCompile(`\([^)]+\)$`) // one "(...)" group at the very end
+var reMultiSpace = regexp.MustCompile(`\s+`)            // any run of whitespace
+
+// NormalizeTitle trims and lowercases s, drops one trailing "(...)" group and collapses whitespace runs.
+func NormalizeTitle(s string) string {
+	s = strings.TrimSpace(s)
+	s = strings.ToLower(s)
+	s = reExtraTitleInfo.ReplaceAllString(s, "")
+	s = reMultiSpace.ReplaceAllString(s, " ")
+	return strings.TrimSpace(s) // removing the "(...)" group can expose a trailing space
+}
diff --git a/internal/similarity/similarity_test.go b/internal/similarity/similarity_test.go
new file mode 100644
index 0000000..206c6f0
--- /dev/null
+++ b/internal/similarity/similarity_test.go
@@ -0,0 +1,51 @@
+/*
+Copyright © 2024 Philipp Wolfer
+
+Scotty is free software: you can redistribute it and/or modify it under the
+terms of the GNU General Public License as published by the Free Software
+Foundation, either version 3 of the License, or (at your option) any later version.
+
+Scotty is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
+A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+Scotty. If not, see https://www.gnu.org/licenses/.
+*/
+
+package similarity_test
+
+import (
+	"fmt"
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+	"go.uploadedlobster.com/scotty/internal/similarity"
+)
+
+// ExampleSimilarity compares two four-character strings that differ only in "a" versus "ä".
+func ExampleSimilarity() {
+	fmt.Println(similarity.Similarity("bar1", "bär1"))
+	// Output: 0.75
+}
+
+// TestSimilarity covers empty, disjoint, partially matching and identical inputs.
+func TestSimilarity(t *testing.T) {
+	assert.Equal(t, 1.0, similarity.Similarity("", ""))
+	assert.Equal(t, 0.0, similarity.Similarity("foo", ""))
+	assert.Equal(t, 0.0, similarity.Similarity("foo", "bar"))
+	assert.Equal(t, 0.5, similarity.Similarity("foobar", "bar"))
+	assert.Equal(t, 1.0, similarity.Similarity("foo", "foo"))
+}
+
+// ExampleNormalizeTitle feeds a title with mixed case, odd spacing and a "(video edit)" suffix.
+func ExampleNormalizeTitle() {
+	fmt.Println(similarity.NormalizeTitle(" Forever \tFailure (video edit) "))
+	// Output: forever failure
+}
+
+// TestNormalizeTitle checks lowercasing and the trimming of surrounding whitespace.
+func TestNormalizeTitle(t *testing.T) {
+	assert.Equal(t, "forever failure", similarity.NormalizeTitle("Forever Failure"))
+	assert.Equal(t, "foo", similarity.NormalizeTitle(" \tfoo\t \t"))
+}