1
0
Fork 0

Cleanup data with some edit distance tweaks

main
Ambrose Chua 2020-11-07 21:23:31 +08:00
parent bbecbd416b
commit 45ee517838
4 changed files with 65 additions and 25 deletions

View File

@ -25,7 +25,7 @@ test:
DATASETS = \ DATASETS = \
data/cities15000.txt \ data/cities5000.txt \
data/admin1CodesASCII.txt \ data/admin1CodesASCII.txt \
data/countryInfo.txt data/countryInfo.txt
@ -35,11 +35,11 @@ data: js/data.json
js/data.json: $(DATASETS) scripts/data.go js/data.json: $(DATASETS) scripts/data.go
cd scripts && $(GO) run data.go cd scripts && $(GO) run data.go
data/cities15000.txt: data/cities5000.txt:
$(MKDIR) data/ $(MKDIR) data/
$(DOWNLOAD) data/cities15000.zip http://download.geonames.org/export/dump/cities15000.zip $(DOWNLOAD) data/cities5000.zip http://download.geonames.org/export/dump/cities5000.zip
$(UNZIP) data/ data/cities15000.zip $(UNZIP) data/ data/cities5000.zip
$(RM) data/cities15000.zip $(RM) data/cities5000.zip
data/countryInfo.txt: data/countryInfo.txt:
$(MKDIR) data/ $(MKDIR) data/

2
go.mod
View File

@ -1,3 +1,5 @@
module github.com/serverwentdown/datetime.link module github.com/serverwentdown/datetime.link
go 1.14 go 1.14
require github.com/hbollon/go-edlib v1.3.1

2
go.sum Normal file
View File

@ -0,0 +1,2 @@
github.com/hbollon/go-edlib v1.3.1 h1:3x2Faq1xbShKhel5wEYyCNZFguh+s8GH75jdp8w6phU=
github.com/hbollon/go-edlib v1.3.1/go.mod h1:wnt6o6EIVEzUfgbUZY7BerzQ2uvzp354qmS2xaLkrhM=

View File

@ -3,9 +3,10 @@ This script reads in GeoNames data and creates a table of IDs to city names and
timezones. The IDs are created from the ASCII city name, with administrative timezones. The IDs are created from the ASCII city name, with administrative
division level 1 name and country code as disambiguation. Examples of IDs are: division level 1 name and country code as disambiguation. Examples of IDs are:
Singapore Singapore-SG
Ashland_Oregon_US Ban_Bueng-Chon_Buri-TH
Ashland_Mississippi_US Ashland-Oregon-US
Ashland-California-US
*/ */
package main package main
@ -18,7 +19,10 @@ import (
"log" "log"
"os" "os"
"regexp" "regexp"
"sort"
"strings" "strings"
"github.com/hbollon/go-edlib"
) )
var regexName = regexp.MustCompile(`[^a-zA-Z1-9]+`) var regexName = regexp.MustCompile(`[^a-zA-Z1-9]+`)
@ -70,6 +74,50 @@ func splitNames(names string) []string {
return strings.Split(names, ",") return strings.Split(names, ",")
} }
// stringLengthSort implements sort.Interface over a slice of strings,
// ordering longer strings (measured in bytes) before shorter ones.
type stringLengthSort []string

// Len reports how many strings the collection holds.
func (s stringLengthSort) Len() int {
	return len(s)
}

// Less reports whether the string at index i is strictly longer, in bytes,
// than the string at index j, which produces a longest-first ordering.
func (s stringLengthSort) Less(i, j int) bool {
	a, b := s[i], s[j]
	return len(b) < len(a)
}

// Swap exchanges the strings at indexes i and j.
func (s stringLengthSort) Swap(i, j int) {
	s[j], s[i] = s[i], s[j]
}
// limitNames reduces a city's alternate names to a short list of distinct
// entries.
//
// names is sorted in place, longest first (by byte length), and then scanned
// in that order. A candidate is dropped when it:
//   - equals primaryName or is empty,
//   - is identical to its own upper-case form (abbreviation-like),
//   - is reported by edlib.FuzzySearchThreshold as matching an already kept
//     name, or
//   - is a prefix of an already kept name.
//
// At most maxNames names are returned. If the fuzzy search returns an error
// the program is terminated via log.Fatalf.
func limitNames(primaryName string, names []string) []string {
	const (
		// maxNames is the maximum number of alternate names returned.
		maxNames = 10
		// minSimilarity is handed to edlib.FuzzySearchThreshold as the
		// minimum similarity for two names to count as "almost the same".
		minSimilarity = 0.82
	)

	sort.Sort(stringLengthSort(names))

	kept := make([]string, 0, len(names))
	for _, n := range names {
		// Stop as soon as the cap is reached, before spending time on
		// further fuzzy searches whose result could never be appended.
		if len(kept) >= maxNames {
			break
		}
		if n == primaryName || n == "" {
			continue
		}
		// Skip abbreviation-like names.
		// NOTE(review): strings.ToUpper leaves characters without an
		// upper-case mapping unchanged, so names written entirely in a
		// caseless script are skipped here too — confirm this is intended.
		if strings.ToUpper(n) == n {
			continue
		}
		// Skip names that are almost the same as one already kept.
		match, err := edlib.FuzzySearchThreshold(n, kept, minSimilarity, edlib.Levenshtein)
		if err != nil {
			log.Fatalf("Error doing fuzzy search: %v", err)
		}
		if len(match) != 0 {
			continue
		}
		// Skip names that are merely the beginning of a kept name. Kept
		// names are never shorter than n because of the sort order above.
		isPrefix := false
		for _, longer := range kept {
			if strings.HasPrefix(longer, n) {
				isPrefix = true
				break
			}
		}
		if isPrefix {
			continue
		}
		kept = append(kept, n)
	}
	return kept
}
func extendRef(refs ...string) string { func extendRef(refs ...string) string {
return strings.Join(refs, "-") return strings.Join(refs, "-")
} }
@ -144,7 +192,7 @@ func readCities(f string, countries map[string]Country, admin1s map[string]Admin
} }
name := record[1] name := record[1]
ref := normalizeName(record[2]) ref := normalizeName(record[2])
alternateNames := splitNames(record[3]) alternateNames := limitNames(name, splitNames(record[3]))
admin1Code := record[10] admin1Code := record[10]
countryRef := record[8] countryRef := record[8]
timezone := record[17] timezone := record[17]
@ -155,22 +203,9 @@ func readCities(f string, countries map[string]Country, admin1s map[string]Admin
// Build a fully formed ID // Build a fully formed ID
eref := extendRef(ref, admin1.Ref, country.Ref) eref := extendRef(ref, admin1.Ref, country.Ref)
/*
if ref == admin1.Ref {
eref = extendRef(ref, country.Ref)
}
if admin1.Ref == country.Name {
eref = extendRef(ref, country.Ref)
}
*/
if len(admin1.Ref) <= 0 { if len(admin1.Ref) <= 0 {
eref = extendRef(ref, country.Ref) eref = extendRef(ref, country.Ref)
} }
/*
if ref == country.Name && len(admin1.Ref) <= 0 {
eref = extendRef(ref)
}
*/
c := &City{ c := &City{
Ref: ref, Ref: ref,
@ -183,7 +218,7 @@ func readCities(f string, countries map[string]Country, admin1s map[string]Admin
// Warn if there exists a similar city // Warn if there exists a similar city
if e, ok := m[eref]; ok { if e, ok := m[eref]; ok {
if !(e.Ref == c.Ref && e.Name == c.Name && e.Admin1.Ref == c.Admin1.Ref && e.Country.Ref == e.Country.Ref) { if !(e.Ref == c.Ref && e.Admin1.Ref == c.Admin1.Ref && e.Country.Ref == e.Country.Ref) {
log.Printf("WARNING: existing city %s: %v %v", eref, c, e) log.Printf("WARNING: existing city %s: %v %v", eref, c, e)
} }
@ -203,7 +238,7 @@ func main() {
if err != nil { if err != nil {
log.Fatalf("Reading countries failed") log.Fatalf("Reading countries failed")
} }
cities, err := readCities("../data/cities15000.txt", countries, admin1s) cities, err := readCities("../data/cities5000.txt", countries, admin1s)
if err != nil { if err != nil {
log.Fatalf("Reading cities failed") log.Fatalf("Reading cities failed")
} }
@ -213,7 +248,8 @@ func main() {
Cities: cities, Cities: cities,
} }
// Encode JSON file // Encode JSON file
b, err := json.MarshalIndent(data, " ", " ") //b, err := json.MarshalIndent(data, " ", " ")
b, err := json.Marshal(data)
if err != nil { if err != nil {
log.Fatalf("Failed to encode: %v", err) log.Fatalf("Failed to encode: %v", err)
} }