Cleanup data with some edit distance tweaks
parent
bbecbd416b
commit
45ee517838
10
Makefile
10
Makefile
|
@ -25,7 +25,7 @@ test:
|
|||
|
||||
|
||||
DATASETS = \
|
||||
data/cities15000.txt \
|
||||
data/cities5000.txt \
|
||||
data/admin1CodesASCII.txt \
|
||||
data/countryInfo.txt
|
||||
|
||||
|
@ -35,11 +35,11 @@ data: js/data.json
|
|||
js/data.json: $(DATASETS) scripts/data.go
|
||||
cd scripts && $(GO) run data.go
|
||||
|
||||
data/cities15000.txt:
|
||||
data/cities5000.txt:
|
||||
$(MKDIR) data/
|
||||
$(DOWNLOAD) data/cities15000.zip http://download.geonames.org/export/dump/cities15000.zip
|
||||
$(UNZIP) data/ data/cities15000.zip
|
||||
$(RM) data/cities15000.zip
|
||||
$(DOWNLOAD) data/cities5000.zip http://download.geonames.org/export/dump/cities5000.zip
|
||||
$(UNZIP) data/ data/cities5000.zip
|
||||
$(RM) data/cities5000.zip
|
||||
|
||||
data/countryInfo.txt:
|
||||
$(MKDIR) data/
|
||||
|
|
2
go.mod
2
go.mod
|
@ -1,3 +1,5 @@
|
|||
module github.com/serverwentdown/datetime.link
|
||||
|
||||
go 1.14
|
||||
|
||||
require github.com/hbollon/go-edlib v1.3.1
|
||||
|
|
|
@ -0,0 +1,2 @@
|
|||
github.com/hbollon/go-edlib v1.3.1 h1:3x2Faq1xbShKhel5wEYyCNZFguh+s8GH75jdp8w6phU=
|
||||
github.com/hbollon/go-edlib v1.3.1/go.mod h1:wnt6o6EIVEzUfgbUZY7BerzQ2uvzp354qmS2xaLkrhM=
|
|
@ -3,9 +3,10 @@ This script reads in GeoNames data and creates a table of IDs to city names and
|
|||
timezones. The IDs are created from the ASCII city name, with administrative
|
||||
division level 1 name and country code as disambiguation. Examples of IDs are:
|
||||
|
||||
Singapore
|
||||
Ashland_Oregon_US
|
||||
Ashland_Mississippi_US
|
||||
Singapore-SG
|
||||
Ban_Bueng-Chon_Buri-TH
|
||||
Ashland-Oregon-US
|
||||
Ashland-California-US
|
||||
|
||||
*/
|
||||
package main
|
||||
|
@ -18,7 +19,10 @@ import (
|
|||
"log"
|
||||
"os"
|
||||
"regexp"
|
||||
"sort"
|
||||
"strings"
|
||||
|
||||
"github.com/hbollon/go-edlib"
|
||||
)
|
||||
|
||||
var regexName = regexp.MustCompile(`[^a-zA-Z1-9]+`)
|
||||
|
@ -70,6 +74,50 @@ func splitNames(names string) []string {
|
|||
return strings.Split(names, ",")
|
||||
}
|
||||
|
||||
type stringLengthSort []string
|
||||
|
||||
func (p stringLengthSort) Len() int { return len(p) }
|
||||
func (p stringLengthSort) Less(i, j int) bool { return len(p[i]) > len(p[j]) }
|
||||
func (p stringLengthSort) Swap(i, j int) { p[i], p[j] = p[j], p[i] }
|
||||
|
||||
func limitNames(primaryName string, names []string) []string {
|
||||
sort.Sort(stringLengthSort(names))
|
||||
r := make([]string, 0, len(names))
|
||||
for _, n := range names {
|
||||
if n == primaryName || len(n) <= 0 {
|
||||
continue
|
||||
}
|
||||
// Skip abbreviation-like names
|
||||
if strings.ToUpper(n) == n {
|
||||
continue
|
||||
}
|
||||
// Skip almost the same names
|
||||
res, err := edlib.FuzzySearchThreshold(n, r, 0.82, edlib.Levenshtein)
|
||||
if err != nil {
|
||||
log.Fatalf("Error doing fuzzy search: %v", err)
|
||||
}
|
||||
if len(res) != 0 {
|
||||
continue
|
||||
}
|
||||
// Skip substrings
|
||||
skipSubstr := false
|
||||
for _, longer := range r {
|
||||
if strings.HasPrefix(longer, n) {
|
||||
skipSubstr = true
|
||||
}
|
||||
}
|
||||
if skipSubstr {
|
||||
continue
|
||||
}
|
||||
// Limit
|
||||
if len(r) > 10 {
|
||||
break
|
||||
}
|
||||
r = append(r, n)
|
||||
}
|
||||
return r
|
||||
}
|
||||
|
||||
func extendRef(refs ...string) string {
|
||||
return strings.Join(refs, "-")
|
||||
}
|
||||
|
@ -144,7 +192,7 @@ func readCities(f string, countries map[string]Country, admin1s map[string]Admin
|
|||
}
|
||||
name := record[1]
|
||||
ref := normalizeName(record[2])
|
||||
alternateNames := splitNames(record[3])
|
||||
alternateNames := limitNames(name, splitNames(record[3]))
|
||||
admin1Code := record[10]
|
||||
countryRef := record[8]
|
||||
timezone := record[17]
|
||||
|
@ -155,22 +203,9 @@ func readCities(f string, countries map[string]Country, admin1s map[string]Admin
|
|||
|
||||
// Bulid a full formed ID
|
||||
eref := extendRef(ref, admin1.Ref, country.Ref)
|
||||
/*
|
||||
if ref == admin1.Ref {
|
||||
eref = extendRef(ref, country.Ref)
|
||||
}
|
||||
if admin1.Ref == country.Name {
|
||||
eref = extendRef(ref, country.Ref)
|
||||
}
|
||||
*/
|
||||
if len(admin1.Ref) <= 0 {
|
||||
eref = extendRef(ref, country.Ref)
|
||||
}
|
||||
/*
|
||||
if ref == country.Name && len(admin1.Ref) <= 0 {
|
||||
eref = extendRef(ref)
|
||||
}
|
||||
*/
|
||||
|
||||
c := &City{
|
||||
Ref: ref,
|
||||
|
@ -183,7 +218,7 @@ func readCities(f string, countries map[string]Country, admin1s map[string]Admin
|
|||
|
||||
// Warn if there exists a similar city
|
||||
if e, ok := m[eref]; ok {
|
||||
if !(e.Ref == c.Ref && e.Name == c.Name && e.Admin1.Ref == c.Admin1.Ref && e.Country.Ref == e.Country.Ref) {
|
||||
if !(e.Ref == c.Ref && e.Admin1.Ref == c.Admin1.Ref && e.Country.Ref == e.Country.Ref) {
|
||||
|
||||
log.Printf("WARNING: existing city %s: %v %v", eref, c, e)
|
||||
}
|
||||
|
@ -203,7 +238,7 @@ func main() {
|
|||
if err != nil {
|
||||
log.Fatalf("Reading countries failed")
|
||||
}
|
||||
cities, err := readCities("../data/cities15000.txt", countries, admin1s)
|
||||
cities, err := readCities("../data/cities5000.txt", countries, admin1s)
|
||||
if err != nil {
|
||||
log.Fatalf("Reading cities failed")
|
||||
}
|
||||
|
@ -213,7 +248,8 @@ func main() {
|
|||
Cities: cities,
|
||||
}
|
||||
// Encode JSON file
|
||||
b, err := json.MarshalIndent(data, " ", " ")
|
||||
//b, err := json.MarshalIndent(data, " ", " ")
|
||||
b, err := json.Marshal(data)
|
||||
if err != nil {
|
||||
log.Fatalf("Failed to encode: %v", err)
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue