1
0
Fork 0

Use maps to flatten data

main
Ambrose Chua 2020-11-07 20:45:00 +08:00
parent c481a9ffa2
commit bbecbd416b
2 changed files with 159 additions and 83 deletions

View File

@ -33,7 +33,7 @@ DATASETS = \
data: js/data.json
js/data.json: $(DATASETS) scripts/data.go
$(GO) run scripts/data.go
cd scripts && $(GO) run data.go
data/cities15000.txt:
$(MKDIR) data/

View File

@ -21,129 +21,205 @@ import (
"strings"
)
// regexName matches every run of characters that are not ASCII letters or
// digits. The digit range is 0-9: the previous 1-9 range treated "0" as a
// separator, so a name such as "10th" was rewritten to "1_th".
var regexName = regexp.MustCompile(`[^a-zA-Z0-9]+`)
// Data is all the data needed to map cities to timezones.
// It is the top-level value that main encodes to JSON.
type Data struct {
// Cities holds the map returned by readCities, keyed by the extended
// reference that readCities builds for each city.
Cities map[string]*City
}
// City represents a city that belongs inside an administrative division level 1
// and a country
type City struct {
// Ref is the ASCII name of the city
Ref string `json:"r"`
// NOTE(review): Ref is declared a second time here with a different tag
// ("-" instead of "r"). A struct cannot declare the same field twice;
// confirm which declaration is the intended one and drop the other.
Ref string `json:"-"`
// Name is the full UTF-8 name of the city
Name string `json:"n"`
// AlternateNames is the list readCities obtains by passing record[3] to
// splitNames.
AlternateNames []string `json:"an"`
// Timezone is copied by readCities from record[17].
Timezone string `json:"t"`
// Admin1Ref is the ASCII name of the administrative division level 1
Admin1Ref string `json:"a1r"`
// NOTE(review): readCities sets neither Admin1Ref nor Admin1Name; they
// look superseded by the Admin1 field below — confirm.
Admin1Name string `json:"a1n"`
// Admin1 is the division readCities looks up for the city.
Admin1 Admin1 `json:"a1"`
// Country is the country readCities looks up for the city.
Country Country `json:"c"`
}
// Admin1 represents an administrative division level 1
type Admin1 struct {
// Code is the administrative division level 1 identifier, usually ISO-3166
// readAdmin1Divisions takes it from record[0] and uses it as the map key;
// readCities looks divisions up with countryRef + "." + admin1Code.
// The "-" tag keeps it out of the JSON output.
Code string `json:"-"`
// Ref is the ASCII name of the administrative division level 1
// (record[2] passed through normalizeName); not written to JSON.
Ref string `json:"-"`
// Name is the full UTF-8 name of the division
Name string `json:"n"`
}
// Country represents a country
type Country struct {
// CountryRef is the ISO-3166 2-letter country code
CountryRef string `json:"cr"`
// NOTE(review): readCountries sets neither CountryRef nor CountryName;
// they look superseded by Ref and Name below — confirm.
CountryName string `json:"cn"`
// Ref is the value readCountries reads from record[0]; it is also the key
// of the map readCountries returns. The "-" tag keeps it out of the JSON
// output.
Ref string `json:"-"`
// Name is the full UTF-8 name of the country
Name string `json:"n"`
}
func main() {
// normalizeName turns a name into a reference string: every run of
// characters matched by regexName becomes a single "_", and any "_" left at
// either end of the result is stripped.
func normalizeName(name string) string {
	return strings.Trim(regexName.ReplaceAllString(name, "_"), "_")
}
/*
// splitNames splits a comma-separated list of names into its elements.
//
// An empty input yields an empty, non-nil slice. strings.Split alone would
// return [""] for it, which would show up in the JSON output as a single
// empty alternate name; the non-nil slice keeps the JSON value an array
// ([]) rather than null.
func splitNames(names string) []string {
	if names == "" {
		return []string{}
	}
	return strings.Split(names, ",")
}
func main() {
// Read CSV data
citiesFile, err := os.Open("data/cities15000.txt")
// extendRef concatenates the given reference parts into one identifier,
// placing a "-" between consecutive parts. No parts give the empty string.
func extendRef(refs ...string) string {
	var b strings.Builder
	for i, part := range refs {
		if i > 0 {
			b.WriteByte('-')
		}
		b.WriteString(part)
	}
	return b.String()
}
// readAdmin1Divisions reads the tab-separated file f (lines starting with
// '#' are comments) and returns a map from record[0], the division code, to
// an Admin1 whose Ref is record[2] passed through normalizeName and whose
// Name is record[1]. A failure to open f is returned as the error.
//
// NOTE(review): as listed, this body also contains statements that use
// citiesFile, CityFromRecord, cities and collisions, declares r twice, and
// its braces do not balance. Those statements look like leftovers of an
// earlier main — confirm and remove them.
func readAdmin1Divisions(f string) (map[string]Admin1, error) {
file, err := os.Open(f)
if err != nil {
// NOTE(review): log.Fatalf exits the process, so the return below is
// never reached while this call is present.
log.Fatalf("Opening file failed: %v", err)
return nil, err
}
// NOTE(review): citiesFile is not declared in this function, and r is
// declared again on the next line.
r := csv.NewReader(citiesFile)
r := csv.NewReader(file)
r.Comma = '\t'
r.Comment = '#'
// Track collisions
collisions := make(map[string]bool)
// Pick out useful information
cities := make(map[string]City)
// m is the result: division code -> Admin1.
m := make(map[string]Admin1)
for {
record, err := r.Read()
if err == io.EOF {
break
}
if err != nil {
log.Fatalf("Unable to read CSV: %v", err)
}
key, city := CityFromRecord(record)
// TODO: Reimplement collision rewriter
// Remap collisions
if _, ok := collisions[key]; ok {
key = key + "_" + city.Admin1Code + "_" + city.CountryCode
// The three statements below are the ones that build the Admin1 entry.
code := record[0]
ref := normalizeName(record[2])
name := record[1]
m[code] = Admin1{
Code: code,
Ref: ref,
Name: name,
}
// Check for collisions
if existing, ok := cities[key]; ok {
if existing.CountryCode == city.CountryCode {
log.Printf("Warning: Repeat entry with same country code for %s (please compare %s with %s)", key, city.Timezone, existing.Timezone)
} else if existing.Timezone == city.Timezone {
log.Printf("Warning: Repeat entry with same timezone for %s", key)
} else {
log.Printf("Warning: Collision entry found for %s. Rewriting (%s but there is %s)", key, city.CountryCode, existing.CountryCode)
cities[key+"_"+existing.Admin1Code+"_"+existing.CountryCode] = existing
// NOTE(review): this deletes the literal string "key", not the value of
// the variable key.
delete(cities, "key")
collisions[key] = true
}
return m, nil
}
// readCountries loads the country table from the tab-separated file f.
//
// Lines starting with '#' are comments and are skipped. Every remaining
// record contributes one Country whose Ref is the first column and whose
// Name is the fifth column; the returned map is keyed by Ref.
//
// An error is returned when f cannot be opened, when a record cannot be
// parsed, or when a record has fewer than five columns (csv.ErrFieldCount).
func readCountries(f string) (map[string]Country, error) {
	file, err := os.Open(f)
	if err != nil {
		return nil, err
	}
	// The file is only read, so the error from Close is not interesting.
	defer file.Close()

	r := csv.NewReader(file)
	r.Comma = '\t'
	r.Comment = '#'
	// Accept records of any width; the columns this function needs are
	// checked explicitly below.
	r.FieldsPerRecord = -1

	m := make(map[string]Country)
	for {
		record, err := r.Read()
		if err == io.EOF {
			break
		}
		if err != nil {
			// A parse error can leave record nil; indexing it would panic.
			return nil, err
		}
		if len(record) <= 4 {
			return nil, csv.ErrFieldCount
		}
		ref := record[0]
		name := record[4]
		m[ref] = Country{
			Ref:  ref,
			Name: name,
		}
	}
	return m, nil
}
// readCities loads the city table from the tab-separated file f and returns
// it keyed by an extended reference.
//
// Lines starting with '#' are comments and are skipped. For every record the
// country is looked up in countries by record[8] and the division in admin1s
// by record[8] + "." + record[10]; a failed lookup yields the zero value.
// The key is "<city ref>-<admin1 ref>-<country ref>", or
// "<city ref>-<country ref>" when the division has no Ref. When a key is
// seen again for a city that differs from the stored one, a warning is
// logged and the later city replaces the earlier one.
//
// An error is returned when f cannot be opened, when a record cannot be
// parsed, or when a record has fewer than 18 columns (csv.ErrFieldCount).
func readCities(f string, countries map[string]Country, admin1s map[string]Admin1) (map[string]*City, error) {
	file, err := os.Open(f)
	if err != nil {
		return nil, err
	}
	// The file is only read, so the error from Close is not interesting.
	defer file.Close()

	r := csv.NewReader(file)
	r.Comma = '\t'
	r.Comment = '#'
	// Accept records of any width; the columns this function needs are
	// checked explicitly below.
	r.FieldsPerRecord = -1

	m := make(map[string]*City)
	for {
		record, err := r.Read()
		if err == io.EOF {
			break
		}
		if err != nil {
			// A parse error can leave record nil; indexing it would panic.
			return nil, err
		}
		if len(record) <= 17 {
			return nil, csv.ErrFieldCount
		}

		name := record[1]
		ref := normalizeName(record[2])
		alternateNames := splitNames(record[3])
		admin1Code := record[10]
		countryRef := record[8]
		timezone := record[17]

		// Resolve Country and Admin1
		country := countries[countryRef]
		admin1 := admin1s[countryRef+"."+admin1Code]

		// Build a fully formed ID
		eref := extendRef(ref, admin1.Ref, country.Ref)
		/*
			if ref == admin1.Ref {
				eref = extendRef(ref, country.Ref)
			}
			if admin1.Ref == country.Name {
				eref = extendRef(ref, country.Ref)
			}
		*/
		if admin1.Ref == "" {
			eref = extendRef(ref, country.Ref)
		}
		/*
			if ref == country.Name && len(admin1.Ref) <= 0 {
				eref = extendRef(ref)
			}
		*/

		c := &City{
			Ref:            ref,
			Name:           name,
			AlternateNames: alternateNames,
			Timezone:       timezone,
			Admin1:         admin1,
			Country:        country,
		}

		// Warn if there exists a similar city. The country comparison must be
		// between the existing entry and the new one; comparing e with itself
		// is always true.
		if e, ok := m[eref]; ok {
			if !(e.Ref == c.Ref && e.Name == c.Name && e.Admin1.Ref == c.Admin1.Ref && e.Country.Ref == c.Country.Ref) {
				log.Printf("WARNING: existing city %s: %v %v", eref, c, e)
			}
		}
		m[eref] = c
	}
	return m, nil
}
// main reads the three input tables under ../data, combines them into a
// Data value and writes it as indented JSON to ../js/data.json. All paths
// are relative to the working directory. Any failure is fatal and is logged
// together with the underlying error.
func main() {
	admin1s, err := readAdmin1Divisions("../data/admin1CodesASCII.txt")
	if err != nil {
		log.Fatalf("Reading administrative divisions level 1 failed: %v", err)
	}
	countries, err := readCountries("../data/countryInfo.txt")
	if err != nil {
		log.Fatalf("Reading countries failed: %v", err)
	}
	cities, err := readCities("../data/cities15000.txt", countries, admin1s)
	if err != nil {
		log.Fatalf("Reading cities failed: %v", err)
	}

	// Group data
	data := Data{
		Cities: cities,
	}

	// Encode JSON file
	b, err := json.MarshalIndent(data, " ", " ")
	if err != nil {
		log.Fatalf("Failed to encode: %v", err)
	}

	// Write JSON file
	err = ioutil.WriteFile("../js/data.json", b, 0644)
	if err != nil {
		log.Fatalf("Failed to write: %v", err)
	}
}
// Data wraps the city table; in this declaration Cities maps a key to a
// City value (not a pointer).
// NOTE(review): a type named Data is also declared earlier in this listing
// with a map[string]*City field — confirm which declaration is current.
type Data struct {
Cities map[string]City
}
// City is the per-city record produced by CityFromRecord.
// NOTE(review): a type named City is also declared earlier in this listing
// with a different field set — confirm which declaration is current.
type City struct {
// Names is record[3] split by splitNames.
Names []string `json:"n"`
// Admin1Code is record[10].
Admin1Code string `json:"a"`
// CountryCode is record[8].
CountryCode string `json:"c"`
// Timezone is record[17].
Timezone string `json:"t"`
}
// TODO: might be better to have IDs be City_Administrative_SG and the City struct having Names, Administrative, Country as pure text

// CityFromRecord converts one record into a key and a City. The key is
// record[2] passed through normalizeName; the City takes its Names from
// record[3] (via splitNames), its Admin1Code from record[10], its
// CountryCode from record[8] and its Timezone from record[17]. The record
// is indexed without a length check.
func CityFromRecord(record []string) (string, City) {
	key := normalizeName(record[2])
	return key, City{
		Names:       splitNames(record[3]),
		Admin1Code:  record[10],
		CountryCode: record[8],
		Timezone:    record[17],
	}
}
// re matches any single character that is not an ASCII letter or digit.
// The digit range is 0-9: the previous 1-9 range treated "0" as a character
// to replace, so "10th" became "1_th".
var re = regexp.MustCompile(`[^a-zA-Z0-9]`)

// normalizeName replaces every character matched by re with "_". Each
// matched character is replaced individually, so a run of n such characters
// becomes n underscores.
func normalizeName(name string) string {
	return re.ReplaceAllString(name, "_")
}
// splitNames splits a comma-separated list of names into its elements.
//
// An empty input yields an empty, non-nil slice. strings.Split alone would
// return [""] for it, i.e. a list holding one empty name; the non-nil slice
// keeps the JSON encoding an array ([]) rather than null.
func splitNames(names string) []string {
	if names == "" {
		return []string{}
	}
	return strings.Split(names, ",")
}
*/