230 lines
4.8 KiB
Go
230 lines
4.8 KiB
Go
/*
|
|
This script reads in GeoNames data and creates a table of IDs to city names and
|
|
timezones. The IDs are created from the ASCII city name, with administrative
|
|
division level 1 name and country code as disambiguation. Examples of IDs are:
|
|
|
|
Singapore-SG
|
|
Ban_Bueng-Chon_Buri-TH
|
|
Ashland-Oregon-US
|
|
Ashland-California-US
|
|
|
|
*/
|
|
package main
|
|
|
|
import (
|
|
"encoding/csv"
|
|
"encoding/json"
|
|
"io"
|
|
"io/ioutil"
|
|
"log"
|
|
"os"
|
|
"regexp"
|
|
"sort"
|
|
"strconv"
|
|
"strings"
|
|
|
|
"github.com/hbollon/go-edlib"
|
|
"github.com/serverwentdown/datetime.link/data"
|
|
)
|
|
|
|
// regexName matches every run of characters that may not appear in an ID
// component: anything other than ASCII letters, ASCII digits and the
// apostrophe. The digit range is 0-9 so that a zero in a name (for example
// "10th") is preserved instead of being replaced.
var regexName = regexp.MustCompile(`[^a-zA-Z0-9']+`)
|
|
|
|
// extendRef concatenates the given reference parts, in order, placing a
// hyphen between consecutive parts. With no parts it returns the empty
// string.
func extendRef(refs ...string) string {
	var b strings.Builder
	for i, part := range refs {
		if i > 0 {
			b.WriteByte('-')
		}
		b.WriteString(part)
	}
	return b.String()
}
|
|
|
|
func normalizeName(name string) string {
|
|
simple := regexName.ReplaceAllString(name, "_")
|
|
trimmed := strings.Trim(simple, "_")
|
|
return trimmed
|
|
}
|
|
|
|
// splitNames breaks a comma-separated list of names into its elements.
// Elements are returned verbatim; an empty input yields a slice holding a
// single empty string.
func splitNames(names string) []string {
	const separator = ","
	parts := strings.Split(names, separator)
	return parts
}
|
|
|
|
// stringLengthSort provides the Len, Less and Swap methods needed to pass a
// slice of strings to sort.Sort, ordering it from longest to shortest by
// byte length.
type stringLengthSort []string

// Len reports how many strings the slice holds.
func (p stringLengthSort) Len() int {
	return len(p)
}

// Less reports whether the string at i is strictly longer than the string at
// j, which places longer strings first.
func (p stringLengthSort) Less(i, j int) bool {
	return len(p[j]) < len(p[i])
}

// Swap exchanges the strings at positions i and j.
func (p stringLengthSort) Swap(i, j int) {
	p[j], p[i] = p[i], p[j]
}
|
|
|
|
func limitNames(primaryName string, names []string) ([]string, error) {
|
|
sort.Sort(stringLengthSort(names))
|
|
r := make([]string, 0, len(names))
|
|
for _, n := range names {
|
|
if n == primaryName || len(n) <= 0 {
|
|
continue
|
|
}
|
|
// Skip abbreviation-like names
|
|
if strings.ToUpper(n) == n {
|
|
continue
|
|
}
|
|
// Skip almost the same names
|
|
res, err := edlib.FuzzySearchThreshold(n, r, 0.82, edlib.Levenshtein)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
if len(res) != 0 {
|
|
continue
|
|
}
|
|
// Skip substrings
|
|
skipSubstr := false
|
|
for _, longer := range r {
|
|
if strings.HasPrefix(longer, n) {
|
|
skipSubstr = true
|
|
}
|
|
}
|
|
if skipSubstr {
|
|
continue
|
|
}
|
|
// Limit
|
|
if len(r) > 10 {
|
|
break
|
|
}
|
|
r = append(r, n)
|
|
}
|
|
return r, nil
|
|
}
|
|
|
|
func readAdmin1Divisions(f string) (map[string]data.Admin1, error) {
|
|
file, err := os.Open(f)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
r := csv.NewReader(file)
|
|
r.Comma = '\t'
|
|
r.Comment = '#'
|
|
|
|
m := make(map[string]data.Admin1)
|
|
for {
|
|
record, err := r.Read()
|
|
if err == io.EOF {
|
|
break
|
|
}
|
|
code := record[0]
|
|
ref := normalizeName(record[2])
|
|
name := record[1]
|
|
m[code] = data.Admin1{
|
|
Code: code,
|
|
Ref: ref,
|
|
Name: name,
|
|
}
|
|
|
|
}
|
|
return m, nil
|
|
}
|
|
|
|
func readCountries(f string) (map[string]data.Country, error) {
|
|
file, err := os.Open(f)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
r := csv.NewReader(file)
|
|
r.Comma = '\t'
|
|
r.Comment = '#'
|
|
|
|
m := make(map[string]data.Country)
|
|
for {
|
|
record, err := r.Read()
|
|
if err == io.EOF {
|
|
break
|
|
}
|
|
ref := record[0]
|
|
name := record[4]
|
|
m[ref] = data.Country{
|
|
Ref: ref,
|
|
Name: name,
|
|
}
|
|
}
|
|
return m, nil
|
|
}
|
|
|
|
func readCities(f string, countries map[string]data.Country, admin1s map[string]data.Admin1) (map[string]*data.City, error) {
|
|
file, err := os.Open(f)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
r := csv.NewReader(file)
|
|
r.Comma = '\t'
|
|
r.Comment = '#'
|
|
|
|
m := make(map[string]*data.City)
|
|
for {
|
|
record, err := r.Read()
|
|
if err == io.EOF {
|
|
break
|
|
}
|
|
name := record[1]
|
|
ref := normalizeName(record[2])
|
|
alternateNames, err := limitNames(name, splitNames(record[3]))
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
admin1Code := record[10]
|
|
countryRef := record[8]
|
|
population, err := strconv.ParseUint(record[14], 10, 64)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
timezone := record[17]
|
|
|
|
// Resolve Country and Admin1
|
|
country := countries[countryRef]
|
|
admin1 := admin1s[countryRef+"."+admin1Code]
|
|
|
|
// Bulid a full formed ID
|
|
eref := extendRef(ref, admin1.Ref, country.Ref)
|
|
if len(admin1.Ref) <= 0 {
|
|
eref = extendRef(ref, country.Ref)
|
|
}
|
|
|
|
c := &data.City{
|
|
Ref: ref,
|
|
Name: name,
|
|
AlternateNames: alternateNames,
|
|
Timezone: timezone,
|
|
Population: population,
|
|
Admin1: admin1,
|
|
Country: country,
|
|
}
|
|
|
|
// Warn if there exists a similar city
|
|
if e, ok := m[eref]; ok {
|
|
if !(e.Ref == c.Ref && e.Admin1.Ref == c.Admin1.Ref && e.Country.Ref == e.Country.Ref) {
|
|
|
|
log.Printf("WARNING: existing city %s: %v %v", eref, c, e)
|
|
}
|
|
}
|
|
|
|
m[eref] = c
|
|
}
|
|
return m, nil
|
|
}
|
|
|
|
func main() {
|
|
admin1s, err := readAdmin1Divisions("../third-party/admin1CodesASCII.txt")
|
|
if err != nil {
|
|
log.Fatalf("Reading administrative divisions level 1 failed")
|
|
}
|
|
countries, err := readCountries("../third-party/countryInfo.txt")
|
|
if err != nil {
|
|
log.Fatalf("Reading countries failed")
|
|
}
|
|
cities, err := readCities("../third-party/cities15000.txt", countries, admin1s)
|
|
if err != nil {
|
|
log.Fatalf("Reading cities failed")
|
|
}
|
|
|
|
// Encode JSON file
|
|
b, err := json.Marshal(cities)
|
|
if err != nil {
|
|
log.Fatalf("Failed to encode: %v", err)
|
|
}
|
|
// Write JSON file
|
|
err = ioutil.WriteFile("../data/cities.json", b, 0644)
|
|
if err != nil {
|
|
log.Fatalf("Failed to write: %v", err)
|
|
}
|
|
}
|