Cleanup data with some edit distance tweaks
parent
bbecbd416b
commit
45ee517838
10
Makefile
10
Makefile
|
@ -25,7 +25,7 @@ test:
|
||||||
|
|
||||||
|
|
||||||
DATASETS = \
|
DATASETS = \
|
||||||
data/cities15000.txt \
|
data/cities5000.txt \
|
||||||
data/admin1CodesASCII.txt \
|
data/admin1CodesASCII.txt \
|
||||||
data/countryInfo.txt
|
data/countryInfo.txt
|
||||||
|
|
||||||
|
@ -35,11 +35,11 @@ data: js/data.json
|
||||||
js/data.json: $(DATASETS) scripts/data.go
|
js/data.json: $(DATASETS) scripts/data.go
|
||||||
cd scripts && $(GO) run data.go
|
cd scripts && $(GO) run data.go
|
||||||
|
|
||||||
data/cities15000.txt:
|
data/cities5000.txt:
|
||||||
$(MKDIR) data/
|
$(MKDIR) data/
|
||||||
$(DOWNLOAD) data/cities15000.zip http://download.geonames.org/export/dump/cities15000.zip
|
$(DOWNLOAD) data/cities5000.zip http://download.geonames.org/export/dump/cities5000.zip
|
||||||
$(UNZIP) data/ data/cities15000.zip
|
$(UNZIP) data/ data/cities5000.zip
|
||||||
$(RM) data/cities15000.zip
|
$(RM) data/cities5000.zip
|
||||||
|
|
||||||
data/countryInfo.txt:
|
data/countryInfo.txt:
|
||||||
$(MKDIR) data/
|
$(MKDIR) data/
|
||||||
|
|
2
go.mod
2
go.mod
|
@ -1,3 +1,5 @@
|
||||||
module github.com/serverwentdown/datetime.link
|
module github.com/serverwentdown/datetime.link
|
||||||
|
|
||||||
go 1.14
|
go 1.14
|
||||||
|
|
||||||
|
require github.com/hbollon/go-edlib v1.3.1
|
||||||
|
|
|
@ -0,0 +1,2 @@
|
||||||
|
github.com/hbollon/go-edlib v1.3.1 h1:3x2Faq1xbShKhel5wEYyCNZFguh+s8GH75jdp8w6phU=
|
||||||
|
github.com/hbollon/go-edlib v1.3.1/go.mod h1:wnt6o6EIVEzUfgbUZY7BerzQ2uvzp354qmS2xaLkrhM=
|
|
@ -3,9 +3,10 @@ This script reads in GeoNames data and creates a table of IDs to city names and
|
||||||
timezones. The IDs are created from the ASCII city name, with administrative
|
timezones. The IDs are created from the ASCII city name, with administrative
|
||||||
division level 1 name and country code as disambiguation. Examples of IDs are:
|
division level 1 name and country code as disambiguation. Examples of IDs are:
|
||||||
|
|
||||||
Singapore
|
Singapore-SG
|
||||||
Ashland_Oregon_US
|
Ban_Bueng-Chon_Buri-TH
|
||||||
Ashland_Mississippi_US
|
Ashland-Oregon-US
|
||||||
|
Ashland-California-US
|
||||||
|
|
||||||
*/
|
*/
|
||||||
package main
|
package main
|
||||||
|
@ -18,7 +19,10 @@ import (
|
||||||
"log"
|
"log"
|
||||||
"os"
|
"os"
|
||||||
"regexp"
|
"regexp"
|
||||||
|
"sort"
|
||||||
"strings"
|
"strings"
|
||||||
|
|
||||||
|
"github.com/hbollon/go-edlib"
|
||||||
)
|
)
|
||||||
|
|
||||||
var regexName = regexp.MustCompile(`[^a-zA-Z1-9]+`)
|
// regexName matches every run of one or more characters that are not ASCII
// letters or digits. The digit range is 0-9: the previous 1-9 range treated
// the digit zero as a separator character.
// NOTE(review): presumably used when normalizing names into IDs; the call site
// is not visible here, so confirm no consumer relied on "0" being matched.
var regexName = regexp.MustCompile(`[^a-zA-Z0-9]+`)
|
||||||
|
@ -70,6 +74,50 @@ func splitNames(names string) []string {
|
||||||
return strings.Split(names, ",")
|
return strings.Split(names, ",")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// stringLengthSort implements sort.Interface for a []string, ordering the
// strings from longest to shortest by byte length. Strings of equal length
// are ordered by ascending string comparison, which makes the ordering total
// so the sorted result does not depend on how an unstable sort treats ties.
type stringLengthSort []string

// Len returns the number of strings in the collection.
func (p stringLengthSort) Len() int { return len(p) }

// Less reports whether p[i] must sort before p[j]: longer strings come first,
// and equal-length strings fall back to ascending string comparison.
func (p stringLengthSort) Less(i, j int) bool {
	if len(p[i]) != len(p[j]) {
		return len(p[i]) > len(p[j])
	}
	return p[i] < p[j]
}

// Swap exchanges the strings at indices i and j.
func (p stringLengthSort) Swap(i, j int) { p[i], p[j] = p[j], p[i] }
|
||||||
|
|
||||||
|
func limitNames(primaryName string, names []string) []string {
|
||||||
|
sort.Sort(stringLengthSort(names))
|
||||||
|
r := make([]string, 0, len(names))
|
||||||
|
for _, n := range names {
|
||||||
|
if n == primaryName || len(n) <= 0 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
// Skip abbreviation-like names
|
||||||
|
if strings.ToUpper(n) == n {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
// Skip almost the same names
|
||||||
|
res, err := edlib.FuzzySearchThreshold(n, r, 0.82, edlib.Levenshtein)
|
||||||
|
if err != nil {
|
||||||
|
log.Fatalf("Error doing fuzzy search: %v", err)
|
||||||
|
}
|
||||||
|
if len(res) != 0 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
// Skip substrings
|
||||||
|
skipSubstr := false
|
||||||
|
for _, longer := range r {
|
||||||
|
if strings.HasPrefix(longer, n) {
|
||||||
|
skipSubstr = true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if skipSubstr {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
// Limit
|
||||||
|
if len(r) > 10 {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
r = append(r, n)
|
||||||
|
}
|
||||||
|
return r
|
||||||
|
}
|
||||||
|
|
||||||
// extendRef builds a compound reference by joining the given parts, in order,
// with "-" between them. A single part is returned unchanged and calling it
// with no parts yields the empty string.
func extendRef(refs ...string) string {
	const separator = "-"
	joined := strings.Join(refs, separator)
	return joined
}
|
||||||
|
@ -144,7 +192,7 @@ func readCities(f string, countries map[string]Country, admin1s map[string]Admin
|
||||||
}
|
}
|
||||||
name := record[1]
|
name := record[1]
|
||||||
ref := normalizeName(record[2])
|
ref := normalizeName(record[2])
|
||||||
alternateNames := splitNames(record[3])
|
alternateNames := limitNames(name, splitNames(record[3]))
|
||||||
admin1Code := record[10]
|
admin1Code := record[10]
|
||||||
countryRef := record[8]
|
countryRef := record[8]
|
||||||
timezone := record[17]
|
timezone := record[17]
|
||||||
|
@ -155,22 +203,9 @@ func readCities(f string, countries map[string]Country, admin1s map[string]Admin
|
||||||
|
|
||||||
// Build a fully formed ID
|
// Build a fully formed ID
|
||||||
eref := extendRef(ref, admin1.Ref, country.Ref)
|
eref := extendRef(ref, admin1.Ref, country.Ref)
|
||||||
/*
|
|
||||||
if ref == admin1.Ref {
|
|
||||||
eref = extendRef(ref, country.Ref)
|
|
||||||
}
|
|
||||||
if admin1.Ref == country.Name {
|
|
||||||
eref = extendRef(ref, country.Ref)
|
|
||||||
}
|
|
||||||
*/
|
|
||||||
if len(admin1.Ref) <= 0 {
|
if len(admin1.Ref) <= 0 {
|
||||||
eref = extendRef(ref, country.Ref)
|
eref = extendRef(ref, country.Ref)
|
||||||
}
|
}
|
||||||
/*
|
|
||||||
if ref == country.Name && len(admin1.Ref) <= 0 {
|
|
||||||
eref = extendRef(ref)
|
|
||||||
}
|
|
||||||
*/
|
|
||||||
|
|
||||||
c := &City{
|
c := &City{
|
||||||
Ref: ref,
|
Ref: ref,
|
||||||
|
@ -183,7 +218,7 @@ func readCities(f string, countries map[string]Country, admin1s map[string]Admin
|
||||||
|
|
||||||
// Warn if there exists a similar city
|
// Warn if there exists a similar city
|
||||||
if e, ok := m[eref]; ok {
|
if e, ok := m[eref]; ok {
|
||||||
if !(e.Ref == c.Ref && e.Name == c.Name && e.Admin1.Ref == c.Admin1.Ref && e.Country.Ref == e.Country.Ref) {
|
if !(e.Ref == c.Ref && e.Admin1.Ref == c.Admin1.Ref && e.Country.Ref == e.Country.Ref) {
|
||||||
|
|
||||||
log.Printf("WARNING: existing city %s: %v %v", eref, c, e)
|
log.Printf("WARNING: existing city %s: %v %v", eref, c, e)
|
||||||
}
|
}
|
||||||
|
@ -203,7 +238,7 @@ func main() {
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Fatalf("Reading countries failed")
|
log.Fatalf("Reading countries failed")
|
||||||
}
|
}
|
||||||
cities, err := readCities("../data/cities15000.txt", countries, admin1s)
|
cities, err := readCities("../data/cities5000.txt", countries, admin1s)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Fatalf("Reading cities failed")
|
log.Fatalf("Reading cities failed")
|
||||||
}
|
}
|
||||||
|
@ -213,7 +248,8 @@ func main() {
|
||||||
Cities: cities,
|
Cities: cities,
|
||||||
}
|
}
|
||||||
// Encode JSON file
|
// Encode JSON file
|
||||||
b, err := json.MarshalIndent(data, " ", " ")
|
//b, err := json.MarshalIndent(data, " ", " ")
|
||||||
|
b, err := json.Marshal(data)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Fatalf("Failed to encode: %v", err)
|
log.Fatalf("Failed to encode: %v", err)
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue