diff --git a/counterexamples/divisions/geographic_area/bad-class.yaml b/counterexamples/divisions/geographic_area/bad-class.yaml new file mode 100644 index 000000000..caa7728b7 --- /dev/null +++ b/counterexamples/divisions/geographic_area/bad-class.yaml @@ -0,0 +1,16 @@ +--- +id: counterexample:geography:bad-class +type: Feature +geometry: + type: Polygon + coordinates: [[[0, 0], [0, 1], [1, 1], [1, 0], [0, 0]]] +properties: + theme: divisions + type: geographic_area + version: 0 + subtype: cultural + class: regional + names: + primary: A cultural geography with an invalid class. + ext_expected_errors: + - "value must be one of 'colloquial', 'postal'" diff --git a/counterexamples/divisions/geographic_area/bad-geometry-type.yaml b/counterexamples/divisions/geographic_area/bad-geometry-type.yaml new file mode 100644 index 000000000..d8dda06cb --- /dev/null +++ b/counterexamples/divisions/geographic_area/bad-geometry-type.yaml @@ -0,0 +1,17 @@ +--- +id: counterexample:geography:bad-geometry +type: Feature +geometry: + type: Point + coordinates: [0, 0] +properties: + theme: divisions + type: geographic_area + version: 0 + subtype: functional + class: postal + names: + primary: A geography with invalid Point geometry (must be Polygon or MultiPolygon). + ext_expected_errors: + - "/properties/type/enum]: value must be 'Polygon'" + - "/properties/type/enum]: value must be 'MultiPolygon'" diff --git a/counterexamples/divisions/geographic_area/bad-subtype.yaml b/counterexamples/divisions/geographic_area/bad-subtype.yaml new file mode 100644 index 000000000..591b4402d --- /dev/null +++ b/counterexamples/divisions/geographic_area/bad-subtype.yaml @@ -0,0 +1,16 @@ +--- +id: counterexample:geography:bad-subtype +type: Feature +geometry: + type: Polygon + coordinates: [[[0, 0], [0, 1], [1, 1], [1, 0], [0, 0]]] +properties: + theme: divisions + type: geographic_area + version: 0 + subtype: administrative + class: colloquial + names: + primary: A geography with an invalid subtype. 
+ ext_expected_errors: + - "value must be one of 'functional', 'cultural'" diff --git a/counterexamples/divisions/geographic_area/colloquial-functional-mismatch.yaml b/counterexamples/divisions/geographic_area/colloquial-functional-mismatch.yaml new file mode 100644 index 000000000..d066c9955 --- /dev/null +++ b/counterexamples/divisions/geographic_area/colloquial-functional-mismatch.yaml @@ -0,0 +1,16 @@ +--- +id: counterexample:geography:colloquial-functional-mismatch +type: Feature +geometry: + type: Polygon + coordinates: [[[0, 0], [0, 1], [1, 1], [1, 0], [0, 0]]] +properties: + theme: divisions + type: geographic_area + version: 0 + subtype: functional + class: colloquial + names: + primary: A functional geography with colloquial class (only allowed for cultural). + ext_expected_errors: + - "value must be 'cultural'" diff --git a/counterexamples/divisions/geographic_area/missing-class-cultural.yaml b/counterexamples/divisions/geographic_area/missing-class-cultural.yaml new file mode 100644 index 000000000..51900edfd --- /dev/null +++ b/counterexamples/divisions/geographic_area/missing-class-cultural.yaml @@ -0,0 +1,15 @@ +--- +id: counterexample:geography:missing-class +type: Feature +geometry: + type: Polygon + coordinates: [[[0, 0], [0, 1], [1, 1], [1, 0], [0, 0]]] +properties: + theme: divisions + type: geographic_area + version: 0 + subtype: cultural + names: + primary: A geography missing required class property. 
+ ext_expected_errors: + - "missing property 'class'" diff --git a/counterexamples/divisions/geographic_area/missing-class-functional.yaml b/counterexamples/divisions/geographic_area/missing-class-functional.yaml new file mode 100644 index 000000000..b759395dd --- /dev/null +++ b/counterexamples/divisions/geographic_area/missing-class-functional.yaml @@ -0,0 +1,15 @@ +--- +id: counterexample:geography:missing-class-functional +type: Feature +geometry: + type: Polygon + coordinates: [[[0, 0], [0, 1], [1, 1], [1, 0], [0, 0]]] +properties: + theme: divisions + type: geographic_area + version: 0 + subtype: functional + names: + primary: A functional geography missing required class property. + ext_expected_errors: + - "missing property 'class'" diff --git a/counterexamples/divisions/geographic_area/negative-population.yaml b/counterexamples/divisions/geographic_area/negative-population.yaml new file mode 100644 index 000000000..09e3f9bca --- /dev/null +++ b/counterexamples/divisions/geographic_area/negative-population.yaml @@ -0,0 +1,17 @@ +--- +id: counterexample:geography:negative-population +type: Feature +geometry: + type: Polygon + coordinates: [[[0, 0], [0, 1], [1, 1], [1, 0], [0, 0]]] +properties: + theme: divisions + type: geographic_area + version: 0 + subtype: cultural + class: colloquial + names: + primary: A geography with negative population. 
+ population: -1000 + ext_expected_errors: + - "minimum: got -1,000, want 0" diff --git a/counterexamples/divisions/geographic_area/postal-cultural-mismatch.yaml b/counterexamples/divisions/geographic_area/postal-cultural-mismatch.yaml new file mode 100644 index 000000000..60998a533 --- /dev/null +++ b/counterexamples/divisions/geographic_area/postal-cultural-mismatch.yaml @@ -0,0 +1,16 @@ +--- +id: counterexample:geography:postal-cultural-mismatch +type: Feature +geometry: + type: Polygon + coordinates: [[[0, 0], [0, 1], [1, 1], [1, 0], [0, 0]]] +properties: + theme: divisions + type: geographic_area + version: 0 + subtype: cultural + class: postal + names: + primary: A cultural geography with postal class (only allowed for functional). + ext_expected_errors: + - "value must be 'functional'" diff --git a/docs/schema/reference/divisions/geographic_area.mdx b/docs/schema/reference/divisions/geographic_area.mdx new file mode 100644 index 000000000..d0c85b49b --- /dev/null +++ b/docs/schema/reference/divisions/geographic_area.mdx @@ -0,0 +1,117 @@ +--- +title: geographic_area +--- + +import CodeBlock from '@theme/CodeBlock'; +import JSONSchemaViewer from "@theme/JSONSchemaViewer"; +import generateResolverOptions from "@site/src/components/shared-libs/generateResolverOptions" +import yamlLoad from "@site/src/components/yamlLoad" +import divisions_geographic_area_schema from '!!raw-loader!@site/docs/_schema/divisions/geographic_area.yaml'; +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; +import geographic_area_example_cultural_bay_area from '!!raw-loader!@site/docs/_examples/divisions/geographic_area/cultural_colloquial_bay_area.yaml'; +import geographic_area_example_cultural_wine_country from '!!raw-loader!@site/docs/_examples/divisions/geographic_area/cultural_colloquial_wine_country.yaml'; +import geographic_area_example_functional_zip from '!!raw-loader!@site/docs/_examples/divisions/geographic_area/functional_postal_zip_code.yaml'; + + +# 
Geographic area + +A geographic area is a functional or cultural region that may span across multiple administrative divisions. These regions capture areas defined by shared characteristics, usage patterns, or cultural identity rather than formal administrative boundaries. + + + + + + + + + + + + + + + + + +
Geometry Type: Polygon or MultiPolygon
Theme: divisions
Type: geographic_area
+ +## Subtypes + +Geographic areas are categorized into two main subtypes: + + + + + + + + + + +
subtype: functional
+ +Functional regions are defined by functional characteristics or usage patterns, such as postal code regions or economic zones. These areas serve specific operational or administrative purposes. + +

Classes: `postal`

+ +
+ + + + + + + + + +
subtype: cultural
+ +Cultural regions are defined by cultural identity, colloquial usage, or shared cultural characteristics. These areas reflect how people commonly refer to regions in everyday language. + +

Classes: `colloquial`

+ + +
+
+ +## Properties + +Key properties of geographic areas include: + +- **names** - Primary name and optional translations +- **subtype** - Either `functional` or `cultural` +- **class** - Required classification (e.g., `colloquial` for cultural, `postal` for functional) +- **associated_division_ids** - Optional list of division IDs that make up the geographic area +- **population** - Optional population count if inferable +- **cartography** - Cartographic hints including prominence for map display +- **wikidata** - Optional Wikidata ID reference + +## Schema + + + + + + + {divisions_geographic_area_schema} + + + +## Examples + + + + { JSON.stringify(yamlLoad(geographic_area_example_cultural_bay_area), null, 2) } + + + { JSON.stringify(yamlLoad(geographic_area_example_cultural_wine_country), null, 2) } + + + { JSON.stringify(yamlLoad(geographic_area_example_functional_zip), null, 2) } + + diff --git a/examples/divisions/geographic_area/cultural_colloquial_bay_area.yaml b/examples/divisions/geographic_area/cultural_colloquial_bay_area.yaml new file mode 100644 index 000000000..f74797b7f --- /dev/null +++ b/examples/divisions/geographic_area/cultural_colloquial_bay_area.yaml @@ -0,0 +1,48 @@ +--- +id: example:geography:cultural:bay_area +type: Feature +geometry: + type: Polygon + coordinates: [ + [ + [-123.0, 37.3], + [-121.5, 37.3], + [-121.5, 38.5], + [-123.0, 38.5], + [-123.0, 37.3] + ] + ] +bbox: [-123.0, 37.3, -121.5, 38.5] +properties: + theme: divisions + type: geographic_area + version: 0 + subtype: cultural + class: colloquial + names: + primary: San Francisco Bay Area + common: + en: Bay Area + es: Área de la Bahía + zh: 舊金山灣區 + cartography: + prominence: 75 + min_zoom: 6 + max_zoom: 12 + associated_division_ids: + - example:division:locality:san_francisco + - example:division:locality:oakland + - example:division:locality:san_jose + - example:division:county:alameda + - example:division:county:contra_costa + - example:division:county:marin + - 
example:division:county:san_francisco + - example:division:county:san_mateo + - example:division:county:santa_clara + population: 7753000 + wikidata: Q213205 + sources: + - property: "" + dataset: OpenStreetMap + record_id: relation/111968 + update_time: "2023-11-15T10:20:30Z" diff --git a/examples/divisions/geographic_area/cultural_colloquial_east_asia.yaml b/examples/divisions/geographic_area/cultural_colloquial_east_asia.yaml new file mode 100644 index 000000000..7f82dd68f --- /dev/null +++ b/examples/divisions/geographic_area/cultural_colloquial_east_asia.yaml @@ -0,0 +1,48 @@ +--- +id: example:geography:cultural:east_asia +type: Feature +geometry: + type: Polygon + coordinates: [ + [ + [100.0, 20.0], + [145.0, 20.0], + [145.0, 50.0], + [100.0, 50.0], + [100.0, 20.0] + ] + ] +bbox: [100.0, 20.0, 145.0, 50.0] +properties: + theme: divisions + type: geographic_area + version: 0 + subtype: cultural + class: colloquial + names: + primary: East Asia + common: + en: East Asia + es: Asia Oriental + fr: Asie de l'Est + zh: 東亞 + zh-Hans: 东亚 + ja: 東アジア + ko: 동아시아 + cartography: + prominence: 85 + min_zoom: 3 + max_zoom: 8 + associated_division_ids: + - example:division:country:cn + - example:division:country:jp + - example:division:country:kr + - example:division:country:kp + - example:division:country:mn + - example:division:country:tw + population: 1677000000 + wikidata: Q27231 + sources: + - property: "" + dataset: OpenStreetMap + record_id: relation/345678 diff --git a/examples/divisions/geographic_area/cultural_colloquial_new_england.yaml b/examples/divisions/geographic_area/cultural_colloquial_new_england.yaml new file mode 100644 index 000000000..d1ba0f030 --- /dev/null +++ b/examples/divisions/geographic_area/cultural_colloquial_new_england.yaml @@ -0,0 +1,45 @@ +--- +id: example:geography:cultural:new_england +type: Feature +geometry: + type: Polygon + coordinates: [ + [ + [-73.5, 41.0], + [-69.5, 41.0], + [-69.5, 47.5], + [-73.5, 47.5], + [-73.5, 41.0] + ] + ] 
+bbox: [-73.5, 41.0, -69.5, 47.5] +properties: + theme: divisions + type: geographic_area + version: 0 + subtype: cultural + class: colloquial + names: + primary: New England + common: + en: New England + es: Nueva Inglaterra + fr: Nouvelle-Angleterre + pt: Nova Inglaterra + cartography: + prominence: 70 + min_zoom: 5 + max_zoom: 10 + associated_division_ids: + - example:division:region:connecticut + - example:division:region:maine + - example:division:region:massachusetts + - example:division:region:new_hampshire + - example:division:region:rhode_island + - example:division:region:vermont + population: 15116205 + wikidata: Q18389 + sources: + - property: "" + dataset: OpenStreetMap + record_id: relation/60759 diff --git a/examples/divisions/geographic_area/cultural_colloquial_south_florida.yaml b/examples/divisions/geographic_area/cultural_colloquial_south_florida.yaml new file mode 100644 index 000000000..4d08ad037 --- /dev/null +++ b/examples/divisions/geographic_area/cultural_colloquial_south_florida.yaml @@ -0,0 +1,41 @@ +--- +id: example:geography:cultural:south_florida +type: Feature +geometry: + type: Polygon + coordinates: [ + [ + [-81.5, 25.0], + [-79.8, 25.0], + [-79.8, 27.0], + [-81.5, 27.0], + [-81.5, 25.0] + ] + ] +bbox: [-81.5, 25.0, -79.8, 27.0] +properties: + theme: divisions + type: geographic_area + version: 0 + subtype: cultural + class: colloquial + names: + primary: South Florida + common: + en: South Florida + es: Sur de Florida + cartography: + prominence: 65 + min_zoom: 7 + max_zoom: 11 + associated_division_ids: + - example:division:county:miami_dade + - example:division:county:broward + - example:division:county:palm_beach + - example:division:county:monroe + population: 6198782 + wikidata: Q1352730 + sources: + - property: "" + dataset: OpenStreetMap + record_id: relation/234567 diff --git a/examples/divisions/geographic_area/cultural_colloquial_wine_country.yaml b/examples/divisions/geographic_area/cultural_colloquial_wine_country.yaml 
new file mode 100644 index 000000000..1d36209f2 --- /dev/null +++ b/examples/divisions/geographic_area/cultural_colloquial_wine_country.yaml @@ -0,0 +1,36 @@ +--- +id: example:geography:cultural:california_wine_country +type: Feature +geometry: + type: Polygon + coordinates: [ + [ + [-123.5, 38.0], + [-122.0, 38.0], + [-122.0, 39.0], + [-123.5, 39.0], + [-123.5, 38.0] + ] + ] +bbox: [-123.5, 38.0, -122.0, 39.0] +properties: + theme: divisions + type: geographic_area + version: 0 + subtype: cultural + class: colloquial + names: + primary: California Wine Country + common: + en: Wine Country + es: Región Vinícola de California + cartography: + prominence: 50 + min_zoom: 8 + max_zoom: 13 + wikidata: Q3303756 + sources: + - property: "" + dataset: OpenStreetMap + record_id: relation/123456 + update_time: "2023-09-10T08:15:00Z" diff --git a/examples/divisions/geographic_area/functional_postal_zip_code.yaml b/examples/divisions/geographic_area/functional_postal_zip_code.yaml new file mode 100644 index 000000000..45df1a5ea --- /dev/null +++ b/examples/divisions/geographic_area/functional_postal_zip_code.yaml @@ -0,0 +1,38 @@ +--- +id: example:geography:functional:us_zip_90210 +type: Feature +geometry: + type: Polygon + coordinates: [ + [ + [-118.416, 34.088], + [-118.396, 34.088], + [-118.396, 34.108], + [-118.416, 34.108], + [-118.416, 34.088] + ] + ] +bbox: [-118.416, 34.088, -118.396, 34.108] +properties: + theme: divisions + type: geographic_area + version: 0 + subtype: functional + class: postal + names: + primary: "90210" + common: + en: Beverly Hills ZIP Code + cartography: + prominence: 30 + min_zoom: 11 + max_zoom: 15 + associated_division_ids: + - example:division:locality:beverly_hills + population: 21733 + wikidata: Q3271856 + sources: + - property: "" + dataset: US Census Bureau + record_id: ZCTA5/90210 + update_time: "2023-08-15T12:00:00Z" diff --git a/gers/examples/python/constants.py b/gers/examples/python/constants.py index f15adb2d0..34776bb74 100644 --- 
a/gers/examples/python/constants.py +++ b/gers/examples/python/constants.py @@ -1,27 +1,24 @@ -from enum import Enum -import os - DEFAULT_H3_RESOLUTION = 12 # default params for nearest match -DEFAULT_NEAREST_MAX_DISTANCE = 100 # meters +DEFAULT_NEAREST_MAX_DISTANCE = 100 # meters # default params for trace snapping -DEFAULT_SIGMA = 4.1 # 4.10351310622546; -DEFAULT_BETA = 0.9 # 0.905918746744877 -> this default beta was found to apply to a 5 second sample rate. +DEFAULT_SIGMA = 4.1 # 4.10351310622546; +DEFAULT_BETA = 0.9 # 0.905918746744877 -> this default beta was found to apply to a 5 second sample rate. # also was found to have good noise rejection characteristics and performed just as well or better than 1 second data, so it # is now our default sampling period - even if the raw data was sampled at a higher rate -DEFAULT_MAX_POINT_TO_ROAD_DISTANCE = 10 # 200m in original paper -DEFAULT_MAX_ROUTE_TO_TRACE_DISTANCE_DIFFERENCE = 300 # what's a good value for this? 2km in original paper but too slow +DEFAULT_MAX_POINT_TO_ROAD_DISTANCE = 10 # 200m in original paper +DEFAULT_MAX_ROUTE_TO_TRACE_DISTANCE_DIFFERENCE = ( + 300 # what's a good value for this? 
2km in original paper but too slow +) DEFAULT_ALLOW_LOOPS = False -DEFAULT_SEGMENT_REVISIT_PENALTY = 100 # set to 0 if no penalty is desired -DEFAULT_VIA_POINT_PENALTY_WEIGHT = 100 # set to 0 if no penalty is desired -DEFAULT_BROKEN_TIME_GAP_RESET_SEQUENCE = 60 # seconds -DEFAULT_BROKEN_DISTANCE_GAP_RESET_SEQUENCE = 300 # meters +DEFAULT_SEGMENT_REVISIT_PENALTY = 100 # set to 0 if no penalty is desired +DEFAULT_VIA_POINT_PENALTY_WEIGHT = 100 # set to 0 if no penalty is desired +DEFAULT_BROKEN_TIME_GAP_RESET_SEQUENCE = 60 # seconds +DEFAULT_BROKEN_DISTANCE_GAP_RESET_SEQUENCE = 300 # meters """default column separator of text files""" COLUMN_SEPARATOR = "\t" DATA_DIR = "gers/examples/python/data" - - diff --git a/gers/examples/python/match_classes.py b/gers/examples/python/match_classes.py index 0e1c55805..05bfae56b 100644 --- a/gers/examples/python/match_classes.py +++ b/gers/examples/python/match_classes.py @@ -1,39 +1,67 @@ import json -from typing import Dict, Iterable +from collections.abc import Iterable + +import constants from shapely.geometry import Point from shapely.geometry.base import BaseGeometry -import constants + class MatchableFeature: """ Convenience class to hold an id, a shapely geometry, and optionally a dictionary of properties for use in matching. It can be trivially populated from geojson and overture as an extension of geojson. 
""" - def __init__(self, id: str, geometry:BaseGeometry, properties: dict=None) -> None: + + def __init__( + self, id: str, geometry: BaseGeometry, properties: dict = None + ) -> None: self.id = str(id) self.geometry = geometry self.properties = properties def __str__(self) -> str: - return json.dumps({ - "id": self.id, - "geometry": self.geometry.wkt, - "properties": self.properties - }) + return json.dumps( + { + "id": self.id, + "geometry": self.geometry.wkt, + "properties": self.properties, + } + ) def get_connector_ids(self) -> Iterable[str]: - return self.properties["connector_ids"] if self.properties is not None and "connector_ids" in self.properties else [] + return ( + self.properties["connector_ids"] + if self.properties is not None and "connector_ids" in self.properties + else [] + ) + class MatchableFeaturesSet: """Collection of matchable features, indexed by id, and by cells (H3 in current implementation)""" - def __init__(self, features: Dict[str, Iterable[MatchableFeature]], cells_by_id: Dict[str, Iterable[str]], features_by_cell: Dict[str, Iterable[MatchableFeature]]) -> None: + + def __init__( + self, + features: dict[str, Iterable[MatchableFeature]], + cells_by_id: dict[str, Iterable[str]], + features_by_cell: dict[str, Iterable[MatchableFeature]], + ) -> None: self.features_by_id = features self.cells_by_id = cells_by_id self.features_by_cell = features_by_cell + class MatchedFeature: """One matched feature with match-relevant information""" - def __init__(self, id: str, matched_feature: MatchableFeature, overlapping_geometry: BaseGeometry, score: float, source_lr: Iterable[float]=None, candidate_lr: Iterable[float]=None) -> None: + + def __init__( + self, + id: str, + matched_feature: MatchableFeature, + overlapping_geometry: BaseGeometry, + score: float, + source_lr: Iterable[float] = None, + candidate_lr: Iterable[float] = None, + ) -> None: """ Attributes: id: the gers id of the matched feature @@ -43,7 +71,7 @@ def __init__(self, id: str, 
matched_feature: MatchableFeature, overlapping_geome source_lr: the Location Reference in the source geometry of the part that matched as array of from-to points projection factors candidate_lr: the Location Reference in the matched geometry of the part that matched the source geometry """ - self.id = id # the gers id of the matched feature + self.id = id # the gers id of the matched feature self.matched_feature = matched_feature self.overlapping_geometry = overlapping_geometry self.score = score @@ -54,7 +82,9 @@ def to_json(self): j = { "id": str(self.id), "candidate_wkt": self.matched_feature.geometry.wkt, - "overlapping_wkt": self.overlapping_geometry.wkt if self.overlapping_geometry is not None else None, + "overlapping_wkt": self.overlapping_geometry.wkt + if self.overlapping_geometry is not None + else None, "score": self.score, } if self.source_lr is not None: @@ -66,30 +96,38 @@ def to_json(self): def __str__(self) -> str: return json.dumps(self.to_json()) + class TraceSnapOptions: - """"Parameters for matching a trace to road segments""" - def __init__(self, \ - sigma=constants.DEFAULT_SIGMA,\ - beta=constants.DEFAULT_BETA,\ - max_point_to_road_distance=constants.DEFAULT_MAX_POINT_TO_ROAD_DISTANCE,\ - max_route_to_trace_distance_difference=constants.DEFAULT_MAX_ROUTE_TO_TRACE_DISTANCE_DIFFERENCE,\ - allow_loops=constants.DEFAULT_ALLOW_LOOPS, - revisit_segment_penalty_weight=constants.DEFAULT_SEGMENT_REVISIT_PENALTY, - revisit_via_point_penalty_weight=constants.DEFAULT_VIA_POINT_PENALTY_WEIGHT, - broken_time_gap_reset_sequence=constants.DEFAULT_BROKEN_TIME_GAP_RESET_SEQUENCE, - broken_distance_gap_reset_sequence=constants.DEFAULT_BROKEN_DISTANCE_GAP_RESET_SEQUENCE) -> None: + """ "Parameters for matching a trace to road segments""" + + def __init__( + self, + sigma=constants.DEFAULT_SIGMA, + beta=constants.DEFAULT_BETA, + max_point_to_road_distance=constants.DEFAULT_MAX_POINT_TO_ROAD_DISTANCE, + 
max_route_to_trace_distance_difference=constants.DEFAULT_MAX_ROUTE_TO_TRACE_DISTANCE_DIFFERENCE, + allow_loops=constants.DEFAULT_ALLOW_LOOPS, + revisit_segment_penalty_weight=constants.DEFAULT_SEGMENT_REVISIT_PENALTY, + revisit_via_point_penalty_weight=constants.DEFAULT_VIA_POINT_PENALTY_WEIGHT, + broken_time_gap_reset_sequence=constants.DEFAULT_BROKEN_TIME_GAP_RESET_SEQUENCE, + broken_distance_gap_reset_sequence=constants.DEFAULT_BROKEN_DISTANCE_GAP_RESET_SEQUENCE, + ) -> None: self.sigma = sigma self.beta = beta self.allow_loops = allow_loops self.max_point_to_road_distance = max_point_to_road_distance - self.max_route_to_trace_distance_difference = max_route_to_trace_distance_difference + self.max_route_to_trace_distance_difference = ( + max_route_to_trace_distance_difference + ) self.revisit_segment_penalty_weight = revisit_segment_penalty_weight self.revisit_via_point_penalty_weight = revisit_via_point_penalty_weight self.broken_time_gap_reset_sequence = broken_time_gap_reset_sequence self.broken_distance_gap_reset_sequence = broken_distance_gap_reset_sequence + class RouteStep: """One step in a route, corresponding to one road segment feature""" + def __init__(self, feature: MatchableFeature, via_point: Point) -> None: """ Attributes: @@ -99,15 +137,34 @@ def __init__(self, feature: MatchableFeature, via_point: Point) -> None: self.feature = feature self.via_point = via_point + class Route: """A route, consisting of a sequence of steps""" + def __init__(self, distance: float, steps: Iterable[RouteStep]) -> None: self.distance = distance self.steps = steps + class SnappedPointPrediction: """A road segment feature as a snap prediction for point in a trace, with relevant match signals""" - def __init__(self, id: str, snapped_point: Point, referenced_feature: MatchableFeature, distance_to_snapped_road: float, route_distance_to_prev_point: float, emission_prob: float, best_transition_prob: float, best_log_prob: float, best_prev_prediction: float, best_sequence: 
Iterable[str], best_route_via_points: Iterable[str], best_revisited_via_points_count:int, best_revisited_segments_count:int) -> None: + + def __init__( + self, + id: str, + snapped_point: Point, + referenced_feature: MatchableFeature, + distance_to_snapped_road: float, + route_distance_to_prev_point: float, + emission_prob: float, + best_transition_prob: float, + best_log_prob: float, + best_prev_prediction: float, + best_sequence: Iterable[str], + best_route_via_points: Iterable[str], + best_revisited_via_points_count: int, + best_revisited_segments_count: int, + ) -> None: self.id = str(id) self.snapped_point = snapped_point self.referenced_feature = referenced_feature @@ -146,9 +203,18 @@ def to_json(self, diagnostic_mode=False): return j + class PointSnapInfo: """Snap-to-road match information corresponding to one point in a trace""" - def __init__(self, index: int, original_point: Point, time: str, seconds_since_prev_point: float=None, predictions:Iterable[SnappedPointPrediction]=[]) -> None: + + def __init__( + self, + index: int, + original_point: Point, + time: str, + seconds_since_prev_point: float = None, + predictions: Iterable[SnappedPointPrediction] = [], + ) -> None: self.index = index self.original_point = original_point self.time = time @@ -157,8 +223,16 @@ def __init__(self, index: int, original_point: Point, time: str, seconds_since_p self.best_prediction = None self.ignore = False - def to_json(self, diagnostic_mode: bool=False, include_all_predictions: bool=False,): - best_prediction_json = None if self.best_prediction is None else self.best_prediction.to_json(diagnostic_mode) + def to_json( + self, + diagnostic_mode: bool = False, + include_all_predictions: bool = False, + ): + best_prediction_json = ( + None + if self.best_prediction is None + else self.best_prediction.to_json(diagnostic_mode) + ) j = { "original_point": self.original_point.wkt, @@ -174,12 +248,31 @@ def to_json(self, diagnostic_mode: bool=False, include_all_predictions: 
bool=Fal j["point_index"] = self.index if include_all_predictions: - j["predictions"] = list(map(lambda x: x.to_json(diagnostic_mode), self.predictions)) + j["predictions"] = list( + map(lambda x: x.to_json(diagnostic_mode), self.predictions) + ) return j + class TraceMatchResult: """Result of a matching trace to road segments""" - def __init__(self, id: str, source_wkt: str, points: Iterable[PointSnapInfo], source_length: float, target_candidates_count: int, matched_target_ids: Iterable[str]=None, elapsed: float=None, sequence_breaks: int=0, points_with_matches: int=0, route_length: float=0, avg_dist_to_road: float=None, revisited_via_points: int=0, revisited_segments: int=0) -> None: + + def __init__( + self, + id: str, + source_wkt: str, + points: Iterable[PointSnapInfo], + source_length: float, + target_candidates_count: int, + matched_target_ids: Iterable[str] = None, + elapsed: float = None, + sequence_breaks: int = 0, + points_with_matches: int = 0, + route_length: float = 0, + avg_dist_to_road: float = None, + revisited_via_points: int = 0, + revisited_segments: int = 0, + ) -> None: self.id = id self.source_wkt = source_wkt self.points = points @@ -195,7 +288,12 @@ def __init__(self, id: str, source_wkt: str, points: Iterable[PointSnapInfo], so self.revisited_segments = revisited_segments def to_json(self, diagnostic_mode=False, include_all_predictions=False): - points_json = list(map(lambda x: x.to_json(diagnostic_mode, include_all_predictions), self.points)) + points_json = list( + map( + lambda x: x.to_json(diagnostic_mode, include_all_predictions), + self.points, + ) + ) return { "id": str(self.id), "elapsed": self.elapsed, @@ -209,7 +307,7 @@ def to_json(self, diagnostic_mode=False, include_all_predictions=False): "revisited_segments": self.revisited_segments, "target_candidates_count": self.target_candidates_count, "target_ids": self.matched_target_ids, - "points": points_json + "points": points_json, } def __str__(self) -> str: diff --git 
a/gers/examples/python/match_traces.ipynb b/gers/examples/python/match_traces.ipynb index 413f761fd..8f8618a0a 100644 --- a/gers/examples/python/match_traces.ipynb +++ b/gers/examples/python/match_traces.ipynb @@ -158,18 +158,21 @@ } ], "source": [ - "\n", "to_match_gdf = gpd.read_file(input_to_match_file)\n", "to_match_gdf.crs = \"epsg:4326\"\n", "\n", "# add column \"id_to_match\"; this will be used to group all match candidates for a feature, so it must be unique within your data set.\n", "# if your data doesn't have an id column, you can use to_match_gdf.index for example\n", - "to_match_gdf[\"id_to_match\"] = to_match_gdf[\"id\"] \n", + "to_match_gdf[\"id_to_match\"] = to_match_gdf[\"id\"]\n", "property_columns = to_match_gdf.columns.difference([\"geometry\", \"id\", \"type\"])\n", - "to_match_gdf[\"properties\"] = to_match_gdf[property_columns].apply(lambda x: x.to_dict(), axis=1)\n", + "to_match_gdf[\"properties\"] = to_match_gdf[property_columns].apply(\n", + " lambda x: x.to_dict(), axis=1\n", + ")\n", "\n", "# construct a MatchableFeature object for each feature\n", - "to_match_gdf[\"feature_to_match\"] = to_match_gdf.apply(lambda row: MatchableFeature(row.id, row.geometry, row.properties), axis=1)\n", + "to_match_gdf[\"feature_to_match\"] = to_match_gdf.apply(\n", + " lambda row: MatchableFeature(row.id, row.geometry, row.properties), axis=1\n", + ")\n", "\n", "to_match_gdf.head(3)" ] @@ -297,12 +300,16 @@ "overture_gdf = gpd.read_file(input_overture_file)\n", "overture_gdf.crs = \"epsg:4326\"\n", "\n", - "# combine properties into a single column \n", + "# combine properties into a single column\n", "property_columns = overture_gdf.columns.difference([\"geometry\", \"id\", \"type\"])\n", - "overture_gdf[\"properties\"] = overture_gdf[property_columns].apply(lambda x: x.to_dict(), axis=1)\n", + "overture_gdf[\"properties\"] = overture_gdf[property_columns].apply(\n", + " lambda x: x.to_dict(), axis=1\n", + ")\n", "\n", "# construct a MatchableFeature 
object for each feature\n", - "overture_gdf[\"candidate_feature\"] = overture_gdf.apply(lambda row: MatchableFeature(row.id, row.geometry, row.properties), axis=1)\n", + "overture_gdf[\"candidate_feature\"] = overture_gdf.apply(\n", + " lambda row: MatchableFeature(row.id, row.geometry, row.properties), axis=1\n", + ")\n", "\n", "overture_gdf.head(3)" ] @@ -510,10 +517,14 @@ "to_match_utm_gdf[\"original_geometry\"] = to_match_utm_gdf[\"geometry\"].copy(deep=True)\n", "\n", "# add 20 meters buffer to the features to match, to account for the fact that the overture features are not perfectly aligned with the features to be matched\n", - "to_match_utm_gdf[\"geometry\"] = to_match_utm_gdf[\"original_geometry\"].buffer(20, cap_style=2)\n", + "to_match_utm_gdf[\"geometry\"] = to_match_utm_gdf[\"original_geometry\"].buffer(\n", + " 20, cap_style=2\n", + ")\n", "\n", "# spatial join between points and segments - get nearest overture feature, where distance < 100 meters\n", - "joined_gdf = sjoin(to_match_utm_gdf, candidates_utm_gdf, how=\"left\", predicate=\"intersects\")\n", + "joined_gdf = sjoin(\n", + " to_match_utm_gdf, candidates_utm_gdf, how=\"left\", predicate=\"intersects\"\n", + ")\n", "joined_gdf.head(3)" ] }, @@ -590,7 +601,9 @@ ], "source": [ "# group join results by id_to_match, and aggregate the candidate features into a list\n", - "grouped_df = joined_gdf.groupby([\"id_to_match\"]).agg({\"feature_to_match\": \"first\", \"candidate_feature\": lambda x: list(x)})\n", + "grouped_df = joined_gdf.groupby([\"id_to_match\"]).agg(\n", + " {\"feature_to_match\": \"first\", \"candidate_feature\": lambda x: list(x)}\n", + ")\n", "grouped_df = grouped_df.reset_index()\n", "grouped_df.head(3)" ] @@ -677,8 +690,11 @@ ], "source": [ "# run the trace matching algorithm for each trace feature to match\n", - "options = TraceSnapOptions(max_point_to_road_distance=10) \n", - "grouped_df[\"match_result\"] = grouped_df.apply(lambda row: get_trace_matches(row.feature_to_match, 
row.candidate_feature, options), axis=1)\n", + "options = TraceSnapOptions(max_point_to_road_distance=10)\n", + "grouped_df[\"match_result\"] = grouped_df.apply(\n", + " lambda row: get_trace_matches(row.feature_to_match, row.candidate_feature, options),\n", + " axis=1,\n", + ")\n", "grouped_df.head(3)" ] }, @@ -690,8 +706,9 @@ "source": [ "# save the results to a file, result as one json line per feature to match\n", "import numpy as np\n", + "\n", "results_df = grouped_df.apply(lambda x: x.match_result.to_json(), axis=1)\n", - "np.savetxt(output_file, results_df.values, fmt='%s')" + "np.savetxt(output_file, results_df.values, fmt=\"%s\")" ] }, { @@ -835,16 +852,34 @@ ], "source": [ "# add some of the metrics from the result object as columns for analysis\n", - "grouped_df[\"source_length\"] = grouped_df.apply(lambda x: x.match_result.source_length, axis=1)\n", - "grouped_df[\"route_length\"] = grouped_df.apply(lambda x: x.match_result.route_length, axis=1)\n", + "grouped_df[\"source_length\"] = grouped_df.apply(\n", + " lambda x: x.match_result.source_length, axis=1\n", + ")\n", + "grouped_df[\"route_length\"] = grouped_df.apply(\n", + " lambda x: x.match_result.route_length, axis=1\n", + ")\n", "grouped_df[\"points\"] = grouped_df.apply(lambda x: len(x.match_result.points), axis=1)\n", - "grouped_df[\"points_with_matches\"] = grouped_df.apply(lambda x: x.match_result.points_with_matches, axis=1)\n", - "grouped_df[\"avg_dist_to_road\"] = grouped_df.apply(lambda x: x.match_result.avg_dist_to_road, axis=1)\n", - "grouped_df[\"sequence_breaks\"] = grouped_df.apply(lambda x: x.match_result.sequence_breaks, axis=1)\n", - "grouped_df[\"revisited_via_points\"] = grouped_df.apply(lambda x: x.match_result.revisited_via_points, axis=1)\n", - "grouped_df[\"revisited_segments\"] = grouped_df.apply(lambda x: x.match_result.revisited_segments, axis=1)\n", - "grouped_df[\"candidates_count\"] = grouped_df.apply(lambda x: x.match_result.target_candidates_count, axis=1)\n", - 
"grouped_df[\"matched_segments\"] = grouped_df.apply(lambda x: len(x.match_result.matched_target_ids), axis=1)\n", + "grouped_df[\"points_with_matches\"] = grouped_df.apply(\n", + " lambda x: x.match_result.points_with_matches, axis=1\n", + ")\n", + "grouped_df[\"avg_dist_to_road\"] = grouped_df.apply(\n", + " lambda x: x.match_result.avg_dist_to_road, axis=1\n", + ")\n", + "grouped_df[\"sequence_breaks\"] = grouped_df.apply(\n", + " lambda x: x.match_result.sequence_breaks, axis=1\n", + ")\n", + "grouped_df[\"revisited_via_points\"] = grouped_df.apply(\n", + " lambda x: x.match_result.revisited_via_points, axis=1\n", + ")\n", + "grouped_df[\"revisited_segments\"] = grouped_df.apply(\n", + " lambda x: x.match_result.revisited_segments, axis=1\n", + ")\n", + "grouped_df[\"candidates_count\"] = grouped_df.apply(\n", + " lambda x: x.match_result.target_candidates_count, axis=1\n", + ")\n", + "grouped_df[\"matched_segments\"] = grouped_df.apply(\n", + " lambda x: len(x.match_result.matched_target_ids), axis=1\n", + ")\n", "grouped_df[\"elapsed\"] = grouped_df.apply(lambda x: x.match_result.elapsed, axis=1)\n", "grouped_df.head(3)" ] diff --git a/gers/examples/python/match_traces.py b/gers/examples/python/match_traces.py index bbf2e6a02..8b55cde97 100644 --- a/gers/examples/python/match_traces.py +++ b/gers/examples/python/match_traces.py @@ -1,25 +1,40 @@ import argparse import csv import json -import os import math +import os +from collections.abc import Iterable +from timeit import default_timer as timer import constants +from match_classes import ( + MatchableFeature, + PointSnapInfo, + RouteStep, + SnappedPointPrediction, + TraceMatchResult, + TraceSnapOptions, +) from route_utils import get_shortest_route -from match_classes import TraceSnapOptions, MatchableFeature, TraceMatchResult, SnappedPointPrediction, PointSnapInfo, RouteStep -from utils import get_features_with_cells, get_seconds_elapsed, get_distance, get_linestring_length, load_matchable_set - from 
shapely import Point from shapely.ops import nearest_points -from timeit import default_timer as timer -from typing import Dict, Iterable - -def get_feature_id_to_connected_features(features_overture: Iterable[MatchableFeature]) -> Dict[str, Iterable[MatchableFeature]]: +from utils import ( + get_distance, + get_features_with_cells, + get_linestring_length, + get_seconds_elapsed, + load_matchable_set, +) + + +def get_feature_id_to_connected_features( + features_overture: Iterable[MatchableFeature], +) -> dict[str, Iterable[MatchableFeature]]: """returns a connected roads "graph" as a dictionary of feature id to features that are connected to it, as modeled in overture schema via connector_ids property""" connector_id_to_features = {} for feature in features_overture: for connector_id in feature.get_connector_ids(): - if not connector_id in connector_id_to_features: + if connector_id not in connector_id_to_features: connector_id_to_features[connector_id] = [] connector_id_to_features[connector_id].append(feature) @@ -32,36 +47,56 @@ def get_feature_id_to_connected_features(features_overture: Iterable[MatchableFe feature_id_to_connected_features[feature.id].append(other_feature) return feature_id_to_connected_features + def read_predictions(predictions_file: str): """reads snap predictions from tab separated file with columns: trace_id, point_index, gers_id, score""" p = {} - with open(predictions_file, 'r') as file: + with open(predictions_file) as file: reader = csv.reader(file, delimiter=constants.COLUMN_SEPARATOR) for row in reader: try: trace_id = row[0] point_index = int(row[1]) gers_id = row[3] - if not(trace_id in p): + if trace_id not in p: p[trace_id] = {} p[trace_id][point_index] = gers_id except ValueError: - continue # header or invalid line + continue # header or invalid line return p -def calculate_error_rate(labeled_file: str, target_features_by_id: Dict[str, Iterable[MatchableFeature]], match_results: Iterable[TraceMatchResult]): + +def 
calculate_error_rate( + labeled_file: str, + target_features_by_id: dict[str, Iterable[MatchableFeature]], + match_results: Iterable[TraceMatchResult], +): """returns total error rate from a labeled file and a list of trace match results""" - if not(os.path.exists(labeled_file)): - print(f'no metrics to compute (file {labeled_file} does not exist)') + if not (os.path.exists(labeled_file)): + print(f"no metrics to compute (file {labeled_file} does not exist)") return labels = read_predictions(labeled_file) total_correct_distance = 0 total_incorrect_distance = 0 - with open(labeled_file + ".actual.txt",'w') as f: - f.write(constants.COLUMN_SEPARATOR.join(["trace_id", "point_index", "label_gers_id", "prediction_gers_id", "label_snapped_wkt", "prediction_snapped_wkt", "distance_to_prev_point", "is_correct"]) + "\n") + with open(labeled_file + ".actual.txt", "w") as f: + f.write( + constants.COLUMN_SEPARATOR.join( + [ + "trace_id", + "point_index", + "label_gers_id", + "prediction_gers_id", + "label_snapped_wkt", + "prediction_snapped_wkt", + "distance_to_prev_point", + "is_correct", + ] + ) + + "\n" + ) for trace_match_result in match_results: - if not(trace_match_result.id in labels): + if trace_match_result.id not in labels: continue correct_distance = 0 @@ -69,13 +104,17 @@ def calculate_error_rate(labeled_file: str, target_features_by_id: Dict[str, Ite prev_point = None for point in trace_match_result.points: - if not(point.index in labels[trace_match_result.id]): - print(f'no label for trace_id={trace_match_result.id} point_index={point.index}') + if point.index not in labels[trace_match_result.id]: + print( + f"no label for trace_id={trace_match_result.id} point_index={point.index}" + ) break label_gers_id = labels[trace_match_result.id][point.index] dist_to_prev_point = 0 - is_correct = not(point.best_prediction is None) and (str(point.best_prediction.id) == label_gers_id) + is_correct = point.best_prediction is not None and ( + str(point.best_prediction.id) == 
label_gers_id + ) if prev_point is not None: dist_to_prev_point = get_distance(prev_point, point.original_point) correct_distance += dist_to_prev_point @@ -88,78 +127,124 @@ def calculate_error_rate(labeled_file: str, target_features_by_id: Dict[str, Ite incorrect_distance += dist_to_prev_point label_snapped_point = None - if not(label_gers_id in target_features_by_id): - print(f'no target feature for label_gers_id={label_gers_id}') + if label_gers_id not in target_features_by_id: + print(f"no target feature for label_gers_id={label_gers_id}") else: label_shape = target_features_by_id[label_gers_id].geometry - x, label_snapped_point = nearest_points(point.original_point, label_shape) - - columns = [\ - str(trace_match_result.id), \ - str(point.index), \ - str(label_gers_id), \ - str(point.best_prediction.id) if point.best_prediction is not None else "", \ - label_snapped_point.wkt if label_snapped_point is not None else "", \ - point.best_prediction.snapped_point.wkt if point.best_prediction is not None else "", \ - str(dist_to_prev_point), \ - str(is_correct), \ - ] + x, label_snapped_point = nearest_points( + point.original_point, label_shape + ) + + columns = [ + str(trace_match_result.id), + str(point.index), + str(label_gers_id), + str(point.best_prediction.id) + if point.best_prediction is not None + else "", + label_snapped_point.wkt if label_snapped_point is not None else "", + point.best_prediction.snapped_point.wkt + if point.best_prediction is not None + else "", + str(dist_to_prev_point), + str(is_correct), + ] f.write(constants.COLUMN_SEPARATOR.join(columns) + "\n") prev_point = point.original_point trace_error_rate = incorrect_distance / correct_distance - print(rf"trace_id={trace_match_result.id} trace_error_rate={trace_error_rate:.2f} correct_distance={correct_distance:.2f} incorrect_distance={incorrect_distance:.2f}") + print( + rf"trace_id={trace_match_result.id} trace_error_rate={trace_error_rate:.2f} correct_distance={correct_distance:.2f} 
incorrect_distance={incorrect_distance:.2f}" + ) total_correct_distance += correct_distance total_incorrect_distance += incorrect_distance if total_correct_distance == 0: - print('no correct distance') + print("no correct distance") return -1 total_error_rate = total_incorrect_distance / total_correct_distance - print(rf"total_error_rate={total_error_rate:.2f} total_correct_distance={total_correct_distance:.2f} total_incorrect_distance={total_incorrect_distance:.2f}") + print( + rf"total_error_rate={total_error_rate:.2f} total_correct_distance={total_correct_distance:.2f} total_incorrect_distance={total_incorrect_distance:.2f}" + ) return total_error_rate -def output_trace_snap_results(match_results: Iterable[TraceMatchResult], output_file_name: str, output_for_judgment: bool = False): - results_json = list(map(lambda x: x.to_json(diagnostic_mode=False, include_all_predictions=False), match_results)) - with open(output_file_name, 'w') as f: + +def output_trace_snap_results( + match_results: Iterable[TraceMatchResult], + output_file_name: str, + output_for_judgment: bool = False, +): + results_json = list( + map( + lambda x: x.to_json(diagnostic_mode=False, include_all_predictions=False), + match_results, + ) + ) + with open(output_file_name, "w") as f: json.dump(results_json, f, indent=4) - results_json = list(map(lambda x: x.to_json(diagnostic_mode=True, include_all_predictions=False), match_results)) - with open(output_file_name + ".with_diagnostics.json", 'w') as f: + results_json = list( + map( + lambda x: x.to_json(diagnostic_mode=True, include_all_predictions=False), + match_results, + ) + ) + with open(output_file_name + ".with_diagnostics.json", "w") as f: json.dump(results_json, f, indent=4) - results_json = list(map(lambda x: x.to_json(diagnostic_mode=True, include_all_predictions=True), match_results)) - with open(output_file_name + ".with_diagnostics-all-predictions.json", 'w') as f: + results_json = list( + map( + lambda x: 
x.to_json(diagnostic_mode=True, include_all_predictions=True), + match_results, + ) + ) + with open(output_file_name + ".with_diagnostics-all-predictions.json", "w") as f: json.dump(results_json, f, indent=4) if output_for_judgment: - with open(output_file_name + ".for_judgment.txt",'w') as f: - f.write(constants.COLUMN_SEPARATOR.join(["trace_id", "point_index", "trace_point_wkt", "gers_id"]) + "\n") + with open(output_file_name + ".for_judgment.txt", "w") as f: + f.write( + constants.COLUMN_SEPARATOR.join( + ["trace_id", "point_index", "trace_point_wkt", "gers_id"] + ) + + "\n" + ) for r in match_results: for idx, p in enumerate(r.points): columns = [ str(r.id), str(idx), p.original_point.wkt, - str(p.best_prediction.id) if p.best_prediction is not None else "" + str(p.best_prediction.id) + if p.best_prediction is not None + else "", ] f.write(constants.COLUMN_SEPARATOR.join(columns) + "\n") - with open(output_file_name + ".snapped_points.txt",'w') as f: - f.write(constants.COLUMN_SEPARATOR.join(["trace_id", "point_index", "gers_id", "snapped_point_wkt"]) + "\n") + with open(output_file_name + ".snapped_points.txt", "w") as f: + f.write( + constants.COLUMN_SEPARATOR.join( + ["trace_id", "point_index", "gers_id", "snapped_point_wkt"] + ) + + "\n" + ) for r in match_results: for idx, p in enumerate(r.points): columns = [ str(r.id), str(idx), - str(p.best_prediction.id) if p.best_prediction is not None else "", - p.best_prediction.snapped_point.wkt if p.best_prediction is not None else "" + str(p.best_prediction.id) + if p.best_prediction is not None + else "", + p.best_prediction.snapped_point.wkt + if p.best_prediction is not None + else "", ] f.write(constants.COLUMN_SEPARATOR.join(columns) + "\n") - with open(output_file_name + ".auto_metrics.txt",'w') as f: + with open(output_file_name + ".auto_metrics.txt", "w") as f: header = [ "id", "source_length", @@ -174,7 +259,7 @@ def output_trace_snap_results(match_results: Iterable[TraceMatchResult], output_ 
"revisited_via_points", "revisited_segments", "elapsed", - "source_wkt" + "source_wkt", ] f.write(constants.COLUMN_SEPARATOR.join(header) + "\n") for r in match_results: @@ -184,7 +269,7 @@ def output_trace_snap_results(match_results: Iterable[TraceMatchResult], output_ str(r.route_length), str(len(r.points)), str(r.points_with_matches), - rf"{(100*r.points_with_matches/len(r.points)):.2f}", + rf"{(100 * r.points_with_matches / len(r.points)):.2f}", str(r.target_candidates_count), str(len(r.matched_target_ids)), str(r.avg_dist_to_road), @@ -196,31 +281,51 @@ def output_trace_snap_results(match_results: Iterable[TraceMatchResult], output_ ] f.write(constants.COLUMN_SEPARATOR.join(columns) + "\n") + def set_best_path_predictions(points: Iterable[PointSnapInfo]): """Sets the best prediction for each point in the sequence, starting from the end and going backwards following the best_prev_prediction chain""" last_point = points[-1] - if last_point.predictions is None or len(last_point.predictions) == 0 or last_point.predictions[0].best_log_prob == 0: - return # no path found - - last_point.best_prediction = last_point.predictions[0] # this is sorted descending by probability, so the first one is the best - for idx in range(len(points)-2, -1, -1): + if ( + last_point.predictions is None + or len(last_point.predictions) == 0 + or last_point.predictions[0].best_log_prob == 0 + ): + return # no path found + + last_point.best_prediction = last_point.predictions[ + 0 + ] # this is sorted descending by probability, so the first one is the best + for idx in range(len(points) - 2, -1, -1): if points[idx + 1].best_prediction is not None: - points[idx].best_prediction = points[idx + 1].best_prediction.best_prev_prediction + points[idx].best_prediction = points[ + idx + 1 + ].best_prediction.best_prev_prediction else: - if not(points[idx].ignore) and len(points[idx].predictions) > 0: + if not (points[idx].ignore) and len(points[idx].predictions) > 0: points[idx].best_prediction = 
points[idx].predictions[0] -def extend_sequence(steps: Iterable[RouteStep], prev_prediction: SnappedPointPrediction): + +def extend_sequence( + steps: Iterable[RouteStep], prev_prediction: SnappedPointPrediction +): """Extends the sequence of the traveled segments up to the previous point with the new steps; also returns the number of revisited segments and via points""" revisited_via_points_count = 0 revisited_segments_count = 0 - extended_sequence = prev_prediction.best_sequence.copy() if prev_prediction.best_sequence is not None else [] + extended_sequence = ( + prev_prediction.best_sequence.copy() + if prev_prediction.best_sequence is not None + else [] + ) revisited_segments_count = 0 added_via_points = [] for step in steps: - if len(extended_sequence) == 0 or step.feature.id != extended_sequence[-1]: # either first step or new feature - if len(extended_sequence) > 0 and step.feature.id in extended_sequence: # different than prev segment but present in the sequence, so we are revisiting it + if ( + len(extended_sequence) == 0 or step.feature.id != extended_sequence[-1] + ): # either first step or new feature + if ( + len(extended_sequence) > 0 and step.feature.id in extended_sequence + ): # different than prev segment but present in the sequence, so we are revisiting it revisited_segments_count += 1 extended_sequence.append(step.feature.id) if step.via_point is not None: @@ -234,7 +339,7 @@ def extend_sequence(steps: Iterable[RouteStep], prev_prediction: SnappedPointPre for vp in p.best_route_via_points: all_prev_via_points.add(vp) if len(all_prev_via_points) > 100: - break # optimization for very long traces, don't need to check all of them, just the recent ones + break # optimization for very long traces, don't need to check all of them, just the recent ones p = p.best_prev_prediction for added_via_point in added_via_points: @@ -242,20 +347,26 @@ def extend_sequence(steps: Iterable[RouteStep], prev_prediction: SnappedPointPre revisited_via_points_count += 1 
return (extended_sequence, revisited_segments_count, revisited_via_points_count) -def get_trace_matches(source_feature: MatchableFeature, target_candidates: Iterable[MatchableFeature], options: TraceSnapOptions) -> TraceMatchResult: + +def get_trace_matches( + source_feature: MatchableFeature, + target_candidates: Iterable[MatchableFeature], + options: TraceSnapOptions, +) -> TraceMatchResult: """Matches a `source_feature` trace to most likely traveled `targe_candidates` road segments""" start = timer() - feature_id_to_connected_features = get_feature_id_to_connected_features(target_candidates) + feature_id_to_connected_features = get_feature_id_to_connected_features( + target_candidates + ) filter_feature_ids = set(map(lambda x: x.id, target_candidates)) - times = source_feature.properties.get('times') + times = source_feature.properties.get("times") points = [] prev_point = None sequence_breaks = 0 for idx, coord in enumerate(source_feature.geometry.coords): - original_point = Point(coord[0], coord[1]) predictions = [] @@ -265,7 +376,10 @@ def get_trace_matches(source_feature: MatchableFeature, target_candidates: Itera if distance_to_road > options.max_point_to_road_distance: continue - emission_prob = (1 / (math.sqrt(2*math.pi) * options.sigma)) * math.exp(-0.5 * ((distance_to_road/options.sigma)**2)) # measurement probability - if was on this road how likely is it to have measured the point at this distance + emission_prob = ( + (1 / (math.sqrt(2 * math.pi) * options.sigma)) + * math.exp(-0.5 * ((distance_to_road / options.sigma) ** 2)) + ) # measurement probability - if was on this road how likely is it to have measured the point at this distance best_log_prob = None best_transition_prob = None best_prev_prediction = None @@ -281,32 +395,67 @@ def get_trace_matches(source_feature: MatchableFeature, target_candidates: Itera best_transition_prob = 1 best_sequence = [target_feature.id] else: - trace_dist_from_prev_point = get_distance(original_point, 
prev_point.original_point) + trace_dist_from_prev_point = get_distance( + original_point, prev_point.original_point + ) for prev_prediction in prev_point.predictions: - if not(options.allow_loops) and not(prev_prediction.best_sequence is None) and target_feature.id in prev_prediction.best_sequence and prev_prediction.referenced_feature.id != target_feature.id: + if ( + not (options.allow_loops) + and prev_prediction.best_sequence is not None + and target_feature.id in prev_prediction.best_sequence + and prev_prediction.referenced_feature.id != target_feature.id + ): # already part of best sequence, but then moved to a different segment, so this is not a good candidate, it means this would walk back on itself continue - route = get_shortest_route(target_candidates, feature_id_to_connected_features, prev_prediction.referenced_feature, target_feature, prev_prediction.snapped_point, snapped_point, filter_feature_ids, [] if options.allow_loops else prev_prediction.best_sequence) + route = get_shortest_route( + target_candidates, + feature_id_to_connected_features, + prev_prediction.referenced_feature, + target_feature, + prev_prediction.snapped_point, + snapped_point, + filter_feature_ids, + [] if options.allow_loops else prev_prediction.best_sequence, + ) # check distance is not float('inf') - if route is None or route.distance == float('inf') : + if route is None or route.distance == float("inf"): # couldn't find path, skip this prev_match as impossible to transition from it to this match continue dist_diff = abs(trace_dist_from_prev_point - route.distance) - transition_prob = (1 / options.beta) * math.exp(-dist_diff / options.beta) - - extended_sequence, revisited_segments_count, revisited_via_points_count = extend_sequence(route.steps, prev_prediction) - transition_prob *= math.exp(-revisited_via_points_count * options.revisit_via_point_penalty_weight) # todo: what's the right way to penalize revisiting via points? 
- transition_prob *= math.exp(-revisited_segments_count * options.revisit_segment_penalty_weight) # todo: what's the right way to penalize revisiting segments? - - if dist_diff > options.max_route_to_trace_distance_difference or transition_prob <= 0: + transition_prob = (1 / options.beta) * math.exp( + -dist_diff / options.beta + ) + + ( + extended_sequence, + revisited_segments_count, + revisited_via_points_count, + ) = extend_sequence(route.steps, prev_prediction) + transition_prob *= math.exp( + -revisited_via_points_count + * options.revisit_via_point_penalty_weight + ) # todo: what's the right way to penalize revisiting via points? + transition_prob *= math.exp( + -revisited_segments_count + * options.revisit_segment_penalty_weight + ) # todo: what's the right way to penalize revisiting segments? + + if ( + dist_diff > options.max_route_to_trace_distance_difference + or transition_prob <= 0 + ): continue - #match_prob = prev_prediction.best_prob * emission_prob * transition_prob + # match_prob = prev_prediction.best_prob * emission_prob * transition_prob # probabilities multiplied over many points go to zero (floating point underflow), so use log of product is sum of logs - match_log_prob = prev_prediction.best_log_prob + math.log(emission_prob) + math.log(transition_prob) - #print(f'point#{idx} prev_prediction={prev_prediction.id} transition_prob={transition_prob} emission_prob={emission_prob} match_prob={match_prob} route_dist_from_prev_point={route_dist_from_prev_point} trace_dist_from_prev_point={trace_dist_from_prev_point} dist_diff={dist_diff}') + match_log_prob = ( + prev_prediction.best_log_prob + + math.log(emission_prob) + + math.log(transition_prob) + ) + # print(f'point#{idx} prev_prediction={prev_prediction.id} transition_prob={transition_prob} emission_prob={emission_prob} match_prob={match_prob} route_dist_from_prev_point={route_dist_from_prev_point} trace_dist_from_prev_point={trace_dist_from_prev_point} dist_diff={dist_diff}') if best_log_prob 
is None or match_log_prob > best_log_prob: best_log_prob = match_log_prob best_transition_prob = transition_prob @@ -322,20 +471,42 @@ def get_trace_matches(source_feature: MatchableFeature, target_candidates: Itera # todo: also include the intermediate features in route.path if best_log_prob is None: - continue # couldn't find a path to this point, skip it - #print(f'point#{idx} candidate feature={target_feature.id} best_log_prob={best_log_prob} best_prev_point={best_prev_prediction.id if best_prev_prediction is not None else None} best_transition_prob={best_transition_prob} emission_prob={emission_prob} distance_to_road={distance_to_road}') - prediction = SnappedPointPrediction(target_feature.id, snapped_point, target_feature, distance_to_road, best_route_dist_from_prev_point, emission_prob, best_transition_prob, best_log_prob, best_prev_prediction, best_sequence, best_route_via_points, best_revisited_via_points_count, best_revisited_segments_count) + continue # couldn't find a path to this point, skip it + # print(f'point#{idx} candidate feature={target_feature.id} best_log_prob={best_log_prob} best_prev_point={best_prev_prediction.id if best_prev_prediction is not None else None} best_transition_prob={best_transition_prob} emission_prob={emission_prob} distance_to_road={distance_to_road}') + prediction = SnappedPointPrediction( + target_feature.id, + snapped_point, + target_feature, + distance_to_road, + best_route_dist_from_prev_point, + emission_prob, + best_transition_prob, + best_log_prob, + best_prev_prediction, + best_sequence, + best_route_via_points, + best_revisited_via_points_count, + best_revisited_segments_count, + ) predictions.append(prediction) predictions.sort(key=lambda x: x.best_log_prob, reverse=True) - time_since_prev_point = None if times is None or prev_point is None else get_seconds_elapsed(times[prev_point.index], times[idx]) + time_since_prev_point = ( + None + if times is None or prev_point is None + else 
get_seconds_elapsed(times[prev_point.index], times[idx]) + ) time = None if times is None else times[idx] - point = PointSnapInfo(idx, original_point, time, time_since_prev_point, predictions) + point = PointSnapInfo( + idx, original_point, time, time_since_prev_point, predictions + ) points.append(point) if len(predictions) > 0: - prev_point = point # don't update prev_point unless it has at least one prediction + prev_point = ( + point # don't update prev_point unless it has at least one prediction + ) else: # no predictions for this point, so ignore current point and previous point to attempt to recover sequence; # if gap between current point and prev_point is too big, abandon the prev_point and reset; @@ -346,9 +517,16 @@ def get_trace_matches(source_feature: MatchableFeature, target_candidates: Itera if prev_point.index > 0: prev_point = points[prev_point.index - 1] # gap with no candidates too big if 60seconds or 200m since last point - if (time_since_prev_point is not None and time_since_prev_point > options.broken_time_gap_reset_sequence) or \ - trace_dist_from_prev_point > options.broken_distance_gap_reset_sequence: - #print(rf"#{str(idx)}: sequence break; time_since_prev_point={time_since_prev_point} trace_dist_from_prev_point={trace_dist_from_prev_point}") + if ( + ( + time_since_prev_point is not None + and time_since_prev_point + > options.broken_time_gap_reset_sequence + ) + or trace_dist_from_prev_point + > options.broken_distance_gap_reset_sequence + ): + # print(rf"#{str(idx)}: sequence break; time_since_prev_point={time_since_prev_point} trace_dist_from_prev_point={trace_dist_from_prev_point}") # we have a sequence break, reset prev point, new sequence will start from next point sequence_breaks += 1 prev_point = None @@ -360,10 +538,19 @@ def get_trace_matches(source_feature: MatchableFeature, target_candidates: Itera end = timer() elapsed = end - start source_feature_length = get_linestring_length(source_feature.geometry) - t = 
TraceMatchResult(source_feature.id, source_feature.geometry.wkt, points, source_feature_length, len(target_candidates), elapsed=elapsed, sequence_breaks=sequence_breaks) + t = TraceMatchResult( + source_feature.id, + source_feature.geometry.wkt, + points, + source_feature_length, + len(target_candidates), + elapsed=elapsed, + sequence_breaks=sequence_breaks, + ) set_trace_match_metrics(t) return t + def set_trace_match_metrics(t: TraceMatchResult) -> None: matched_target_ids = set() route_length = 0 @@ -372,56 +559,108 @@ def set_trace_match_metrics(t: TraceMatchResult) -> None: revisited_segments = 0 points_with_matches = 0 for point in t.points: - if point.best_prediction is not None and point.best_prediction.referenced_feature is not None: + if ( + point.best_prediction is not None + and point.best_prediction.referenced_feature is not None + ): points_with_matches += 1 - route_length += point.best_prediction.route_distance_to_prev_point if point.best_prediction.route_distance_to_prev_point is not None else 0 + route_length += ( + point.best_prediction.route_distance_to_prev_point + if point.best_prediction.route_distance_to_prev_point is not None + else 0 + ) dist_to_road += point.best_prediction.distance_to_snapped_road - revisited_via_points += point.best_prediction.best_revisited_via_points_count + revisited_via_points += ( + point.best_prediction.best_revisited_via_points_count + ) revisited_segments += point.best_prediction.best_revisited_segments_count matched_target_ids.add(point.best_prediction.referenced_feature.id) t.matched_target_ids = list(matched_target_ids) t.points_with_matches = points_with_matches t.route_length = round(route_length, 2) - t.avg_dist_to_road = round(dist_to_road / points_with_matches, 2) if points_with_matches > 0 else None + t.avg_dist_to_road = ( + round(dist_to_road / points_with_matches, 2) + if points_with_matches > 0 + else None + ) t.revisited_via_points = revisited_via_points t.revisited_segments = revisited_segments 
-def print_stats(source_features: Iterable[MatchableFeature], target_features: Iterable[MatchableFeature], match_results: Iterable[TraceMatchResult], total_elapsed: float, avg_runtime_per_feature: float): + +def print_stats( + source_features: Iterable[MatchableFeature], + target_features: Iterable[MatchableFeature], + match_results: Iterable[TraceMatchResult], + total_elapsed: float, + avg_runtime_per_feature: float, +): num_traces = len(source_features) - total_route_length = sum([r.route_length for r in match_results]) / 1000 # in km - total_traces_length = sum([r.source_length for r in match_results]) / 1000 # in km + total_route_length = sum([r.route_length for r in match_results]) / 1000 # in km + total_traces_length = sum([r.source_length for r in match_results]) / 1000 # in km total_candidates = sum([r.target_candidates_count for r in match_results]) total_matches = sum([len(r.matched_target_ids) for r in match_results]) total_sequence_breaks = sum([r.sequence_breaks for r in match_results]) total_revisited_via_points = sum([r.revisited_via_points for r in match_results]) total_revisited_segments = sum([r.revisited_segments for r in match_results]) - total_traces_with_matches = sum([1 for r in match_results if r.points_with_matches > 0]) - total_avg_dist_to_road = sum([r.avg_dist_to_road for r in match_results if r.points_with_matches > 0]) - avg_runtime_per_km = total_elapsed / total_traces_length if total_traces_length > 0 else None - avg_dist_to_road = round(total_avg_dist_to_road / total_traces_with_matches, 2) if total_traces_with_matches > 0 else None + total_traces_with_matches = sum( + [1 for r in match_results if r.points_with_matches > 0] + ) + total_avg_dist_to_road = sum( + [r.avg_dist_to_road for r in match_results if r.points_with_matches > 0] + ) + avg_runtime_per_km = ( + total_elapsed / total_traces_length if total_traces_length > 0 else None + ) + avg_dist_to_road = ( + round(total_avg_dist_to_road / total_traces_with_matches, 2) + if 
total_traces_with_matches > 0 + else None + ) print("==================================================================") print("Totals:") print("==================================================================") print(rf"Traces.............................{num_traces}") print(rf"Target features....................{len(target_features)}") - print(rf"Elapsed:...........................{round(total_elapsed//60)}min {total_elapsed%60:.3f}s") + print( + rf"Elapsed:...........................{round(total_elapsed // 60)}min {total_elapsed % 60:.3f}s" + ) print(rf"Avg runtime/trace..................{avg_runtime_per_feature:.3f}s") print(rf"Avg runtime/km.....................{avg_runtime_per_km:.3f}s") print(rf"Avg distance to snapped road.......{avg_dist_to_road}m") print(rf"Snapped route length...............{total_route_length:.2f}km") print(rf"GPS traces length..................{total_traces_length:.2f}km") - print(rf"Snapped route len/gps len..........{(total_route_length/total_traces_length):.2f}") - print(rf"Avg number of candidate segments...{(total_candidates/num_traces):.2f}/trace, {(total_candidates/total_traces_length):.2f}/km") - print(rf"Avg number of matched segments.....{(total_matches/num_traces):.2f}/trace, {(total_matches/total_traces_length):.2f}/km") - print(rf"Avg number of sequence breaks......{(total_sequence_breaks/num_traces):.2f}/trace, {(total_sequence_breaks/total_traces_length):.2f}/km") - print(rf"Avg number of revisited via points.{(total_revisited_via_points/num_traces):.2f}/trace, {(total_revisited_via_points/total_traces_length):.2f}/km") - print(rf"Avg number of revisited segments...{(total_revisited_segments/num_traces):.2f}/trace, {(total_revisited_segments/total_traces_length):.2f}/km") + print( + rf"Snapped route len/gps len..........{(total_route_length / total_traces_length):.2f}" + ) + print( + rf"Avg number of candidate segments...{(total_candidates / num_traces):.2f}/trace, {(total_candidates / total_traces_length):.2f}/km" + 
) + print( + rf"Avg number of matched segments.....{(total_matches / num_traces):.2f}/trace, {(total_matches / total_traces_length):.2f}/km" + ) + print( + rf"Avg number of sequence breaks......{(total_sequence_breaks / num_traces):.2f}/trace, {(total_sequence_breaks / total_traces_length):.2f}/km" + ) + print( + rf"Avg number of revisited via points.{(total_revisited_via_points / num_traces):.2f}/trace, {(total_revisited_via_points / total_traces_length):.2f}/km" + ) + print( + rf"Avg number of revisited segments...{(total_revisited_segments / num_traces):.2f}/trace, {(total_revisited_segments / total_traces_length):.2f}/km" + ) print("==================================================================") -def snap_traces(features_to_match_file: str, overture_file: str, output_file: str, res: int, snap_options: TraceSnapOptions=None, output_for_judgment: bool=False) -> None: + +def snap_traces( + features_to_match_file: str, + overture_file: str, + output_file: str, + res: int, + snap_options: TraceSnapOptions = None, + output_for_judgment: bool = False, +) -> None: if snap_options is None: - snap_options = TraceSnapOptions() # loads default options + snap_options = TraceSnapOptions() # loads default options # save the options we used next to the output file for debugging or comparison with other runs with open(output_file + ".options.json", "w") as f: @@ -430,19 +669,21 @@ def snap_traces(features_to_match_file: str, overture_file: str, output_file: st start = timer() print("Loading features...") to_match_prop_filter = {} - #to_match_prop_filter["id"] = "manual_trace#4" + # to_match_prop_filter["id"] = "manual_trace#4" to_match = load_matchable_set(features_to_match_file, is_multiline=False, res=res) features_to_match = to_match.features_by_id.values() if len(features_to_match) == 0: print("no features to match") exit() - overture = load_matchable_set(overture_file, is_multiline=True, properties_filter = {"type": "segment"}, res=res) - features_overture 
=overture.features_by_id.values() + overture = load_matchable_set( + overture_file, is_multiline=True, properties_filter={"type": "segment"}, res=res + ) + features_overture = overture.features_by_id.values() print("Features to match: " + str(len(features_to_match))) print("Features Overture: " + str(len(features_overture))) end = timer() - print(f"Loading time: {(end-start):.2f}s") + print(f"Loading time: {(end - start):.2f}s") i = 0 match_results = [] @@ -450,46 +691,142 @@ def snap_traces(features_to_match_file: str, overture_file: str, output_file: st for source_feature in features_to_match: i += 1 - target_candidates = get_features_with_cells(overture.features_by_cell, to_match.cells_by_id[source_feature.id]) + target_candidates = get_features_with_cells( + overture.features_by_cell, to_match.cells_by_id[source_feature.id] + ) match_res = get_trace_matches(source_feature, target_candidates, snap_options) match_results.append(match_res) total_elapsed += match_res.elapsed avg_runtime_per_feature = total_elapsed / i - if i%1 == 0: - print(rf"trace#{str(i)} length={match_res.source_length} route_length={round(match_res.route_length)} " + \ - rf"points={len(source_feature.geometry.coords)} points_w_matches={match_res.points_with_matches} " + \ - rf"candidates={match_res.target_candidates_count} matched target_ids: {str(len(match_res.matched_target_ids))} " + \ - rf"elapsed: {match_res.elapsed:.2f}s; avg runtime/feature: {avg_runtime_per_feature:.3f}s") - - print_stats(features_to_match, features_overture, match_results, total_elapsed, avg_runtime_per_feature) + if i % 1 == 0: + print( + rf"trace#{str(i)} length={match_res.source_length} route_length={round(match_res.route_length)} " + + rf"points={len(source_feature.geometry.coords)} points_w_matches={match_res.points_with_matches} " + + rf"candidates={match_res.target_candidates_count} matched target_ids: {str(len(match_res.matched_target_ids))} " + + rf"elapsed: {match_res.elapsed:.2f}s; avg runtime/feature: 
{avg_runtime_per_feature:.3f}s" + ) + + print_stats( + features_to_match, + features_overture, + match_results, + total_elapsed, + avg_runtime_per_feature, + ) print("Writing results...") start = timer() output_trace_snap_results(match_results, output_file, output_for_judgment) end = timer() - print(f"Writing time: {(end-start):.2f}s") - calculate_error_rate(features_to_match_file.replace('.geojson', '.labeled.txt'), overture.features_by_id, match_results) + print(f"Writing time: {(end - start):.2f}s") + calculate_error_rate( + features_to_match_file.replace(".geojson", ".labeled.txt"), + overture.features_by_id, + match_results, + ) + def get_args(): - parser = argparse.ArgumentParser(description="", add_help=True, formatter_class=argparse.ArgumentDefaultsHelpFormatter) - parser.add_argument("--input-to-match", help="Input file containing features to match in geojson format", required=True) - parser.add_argument("--input-overture", help="Input file containing overture features", required=True) - parser.add_argument("--output", help="Output file containing match results", required=True) - parser.add_argument("--resolution", help="H3 cell resolution used to pre-filter candidates", type=int, default=constants.DEFAULT_H3_RESOLUTION, choices=range(0,15)) - parser.add_argument("--sigma", type=float, help=f"Sigma param - controlling tolerance to GPS noise", required=False, default=constants.DEFAULT_SIGMA) - parser.add_argument("--beta", type=float, help=f"Beta param - controlling confidence in route", required=False, default=constants.DEFAULT_BETA) - parser.add_argument("--allow_loops", type=bool, help=f"Allow same sequence to revisit same segment with other segment(s) in between", required=False, default=constants.DEFAULT_ALLOW_LOOPS) - parser.add_argument("--max_point_to_road_distance", type=float, help=f"Maximum distance in meters between a trace point and a match candidate road", required=False, default=constants.DEFAULT_MAX_POINT_TO_ROAD_DISTANCE) - 
parser.add_argument("--max_route_to_trace_distance_difference", type=float, help=f"Maximum difference between route and trace lengths in meters", required=False, default=constants.DEFAULT_MAX_ROUTE_TO_TRACE_DISTANCE_DIFFERENCE) - parser.add_argument("--revisit_segment_penalty_weight", type=float, help="How much to penalize a route with one segment revisit", required=False, default=constants.DEFAULT_SEGMENT_REVISIT_PENALTY) - parser.add_argument("--revisit_via_point_penalty_weight", type=float, help="How much to penalize a route with one via-point revisit", required=False, default=constants.DEFAULT_VIA_POINT_PENALTY_WEIGHT) - parser.add_argument("--broken_time_gap_reset_sequence", type=float, help="How big the time gap in seconds between points without valid route options before we consider it a broken sequence", required=False, default=constants.DEFAULT_BROKEN_TIME_GAP_RESET_SEQUENCE) - parser.add_argument("--broken_distance_gap_reset_sequence", type=float, help="How big the distance gap in meters between points without valid route options before we consider it a broken sequence", required=False, default=constants.DEFAULT_BROKEN_DISTANCE_GAP_RESET_SEQUENCE) - parser.add_argument("--j", action="store_true", help="Also output the matches as a 'pre-labeled' file for judgment", default=False, required=False) + parser = argparse.ArgumentParser( + description="", + add_help=True, + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + ) + parser.add_argument( + "--input-to-match", + help="Input file containing features to match in geojson format", + required=True, + ) + parser.add_argument( + "--input-overture", + help="Input file containing overture features", + required=True, + ) + parser.add_argument( + "--output", help="Output file containing match results", required=True + ) + parser.add_argument( + "--resolution", + help="H3 cell resolution used to pre-filter candidates", + type=int, + default=constants.DEFAULT_H3_RESOLUTION, + choices=range(0, 15), + ) + 
parser.add_argument( + "--sigma", + type=float, + help="Sigma param - controlling tolerance to GPS noise", + required=False, + default=constants.DEFAULT_SIGMA, + ) + parser.add_argument( + "--beta", + type=float, + help="Beta param - controlling confidence in route", + required=False, + default=constants.DEFAULT_BETA, + ) + parser.add_argument( + "--allow_loops", + type=bool, + help="Allow same sequence to revisit same segment with other segment(s) in between", + required=False, + default=constants.DEFAULT_ALLOW_LOOPS, + ) + parser.add_argument( + "--max_point_to_road_distance", + type=float, + help="Maximum distance in meters between a trace point and a match candidate road", + required=False, + default=constants.DEFAULT_MAX_POINT_TO_ROAD_DISTANCE, + ) + parser.add_argument( + "--max_route_to_trace_distance_difference", + type=float, + help="Maximum difference between route and trace lengths in meters", + required=False, + default=constants.DEFAULT_MAX_ROUTE_TO_TRACE_DISTANCE_DIFFERENCE, + ) + parser.add_argument( + "--revisit_segment_penalty_weight", + type=float, + help="How much to penalize a route with one segment revisit", + required=False, + default=constants.DEFAULT_SEGMENT_REVISIT_PENALTY, + ) + parser.add_argument( + "--revisit_via_point_penalty_weight", + type=float, + help="How much to penalize a route with one via-point revisit", + required=False, + default=constants.DEFAULT_VIA_POINT_PENALTY_WEIGHT, + ) + parser.add_argument( + "--broken_time_gap_reset_sequence", + type=float, + help="How big the time gap in seconds between points without valid route options before we consider it a broken sequence", + required=False, + default=constants.DEFAULT_BROKEN_TIME_GAP_RESET_SEQUENCE, + ) + parser.add_argument( + "--broken_distance_gap_reset_sequence", + type=float, + help="How big the distance gap in meters between points without valid route options before we consider it a broken sequence", + required=False, + 
default=constants.DEFAULT_BROKEN_DISTANCE_GAP_RESET_SEQUENCE, + ) + parser.add_argument( + "--j", + action="store_true", + help="Also output the matches as a 'pre-labeled' file for judgment", + default=False, + required=False, + ) return parser.parse_args() + def get_trace_snap_options_from_args(args): return TraceSnapOptions( sigma=args.sigma, @@ -500,9 +837,18 @@ def get_trace_snap_options_from_args(args): revisit_segment_penalty_weight=args.revisit_segment_penalty_weight, revisit_via_point_penalty_weight=args.revisit_via_point_penalty_weight, broken_time_gap_reset_sequence=args.broken_time_gap_reset_sequence, - broken_distance_gap_reset_sequence=args.broken_distance_gap_reset_sequence) + broken_distance_gap_reset_sequence=args.broken_distance_gap_reset_sequence, + ) + if __name__ == "__main__": args = get_args() trace_snap_options = get_trace_snap_options_from_args(args) - snap_traces(args.input_to_match, args.input_overture, args.output, args.resolution, trace_snap_options, output_for_judgment=args.j) + snap_traces( + args.input_to_match, + args.input_overture, + args.output, + args.resolution, + trace_snap_options, + output_for_judgment=args.j, + ) diff --git a/gers/examples/python/route_utils.py b/gers/examples/python/route_utils.py index 51cbe71bc..ea497b8a0 100644 --- a/gers/examples/python/route_utils.py +++ b/gers/examples/python/route_utils.py @@ -1,10 +1,20 @@ -from utils import get_distance -from shapely.ops import nearest_points -from match_classes import RouteStep, Route, MatchableFeature +from collections.abc import Iterable + +from match_classes import MatchableFeature, Route, RouteStep from shapely.geometry import Point -from typing import Dict, Tuple, Iterable +from shapely.ops import nearest_points +from utils import get_distance -def get_route_step_dist(feat_before_from: MatchableFeature, feat_from: MatchableFeature, feat_to: MatchableFeature, start_feature: MatchableFeature, end_feature: MatchableFeature, start_point: Point, end_point: Point) 
-> Tuple[Point, float]: + +def get_route_step_dist( + feat_before_from: MatchableFeature, + feat_from: MatchableFeature, + feat_to: MatchableFeature, + start_feature: MatchableFeature, + end_feature: MatchableFeature, + start_point: Point, + end_point: Point, +) -> tuple[Point, float]: """get distance traveled on one feature `feat_from` having entering from `feat_before_from` and exiting to `feat_to`, given that the whole route starts at `start_feature` and ends at `end_feature`""" # todo: this a distance approximation for now as length of straight line from entry point to exit point on the feat_from feature, but works reasonably well for the data seen so far feat_from_exit_point, p2 = nearest_points(feat_from.geometry, feat_to.geometry) @@ -13,9 +23,11 @@ def get_route_step_dist(feat_before_from: MatchableFeature, feat_from: Matchable if feat_from.id == start_feature.id: d += get_distance(start_point, feat_from_exit_point) else: - p0_before, feat_from_entry_point = nearest_points(feat_before_from.geometry, feat_from.geometry) + p0_before, feat_from_entry_point = nearest_points( + feat_before_from.geometry, feat_from.geometry + ) d += get_distance(feat_from_entry_point, feat_from_exit_point) - + if feat_to.id == end_feature.id: d += get_distance(end_point, p2) # else there is no distance to add @@ -23,11 +35,21 @@ def get_route_step_dist(feat_before_from: MatchableFeature, feat_from: Matchable # todo: add basic penalties like allowed travel direction disagreement, road class change cost, etc. 
return feat_from_exit_point, d -def get_shortest_route(features: Iterable[MatchableFeature], feature_id_to_connected_features: Dict[str, Iterable[MatchableFeature]], start_feature: MatchableFeature, end_feature: MatchableFeature, start_point: Point, end_point: Point, allowed_ids: Iterable[str], blocked_ids: Iterable[str]) -> Route: + +def get_shortest_route( + features: Iterable[MatchableFeature], + feature_id_to_connected_features: dict[str, Iterable[MatchableFeature]], + start_feature: MatchableFeature, + end_feature: MatchableFeature, + start_point: Point, + end_point: Point, + allowed_ids: Iterable[str], + blocked_ids: Iterable[str], +) -> Route: """ Dijsktra's algorithm to find shortest route between start and end features. Remember for each traveled feature the entry via_point. """ - + # start and end are same feature, no route calculation needed, just distance if start_feature.id == end_feature.id: dist = get_distance(start_point, end_point) @@ -42,7 +64,7 @@ def get_shortest_route(features: Iterable[MatchableFeature], feature_id_to_conne for f in features: if f.id in blocked_ids and f.id != start_feature.id: continue - dist[f.id] = float('inf') + dist[f.id] = float("inf") prev[f.id] = None prev_via_point[f.id] = None feats_to_visit.append(f) @@ -51,41 +73,55 @@ def get_shortest_route(features: Iterable[MatchableFeature], feature_id_to_conne while len(feats_to_visit) > 0: current_feature = feats_to_visit[0] - min_dist = float('inf') + min_dist = float("inf") for f in feats_to_visit: if dist[f.id] < min_dist: min_dist = dist[f.id] current_feature = f - if min_dist == float('inf'): - break # no more allowed connected features to visit + if min_dist == float("inf"): + break # no more allowed connected features to visit if current_feature.id == end_feature.id: - break # done, visited end_feature, don't need to calculate shortest path to all features + break # done, visited end_feature, don't need to calculate shortest path to all features 
feats_to_visit.remove(current_feature) ids_to_visit.remove(current_feature.id) connected_features = feature_id_to_connected_features[current_feature.id] for v in connected_features: - if not(v.id in allowed_ids) or (v.id in blocked_ids) or not(v.id in ids_to_visit): + if ( + v.id not in allowed_ids + or (v.id in blocked_ids) + or v.id not in ids_to_visit + ): continue - if not(v.id in ids_to_visit): - continue # have already visited this feature + if v.id not in ids_to_visit: + continue # have already visited this feature - via_point, d = get_route_step_dist(prev[current_feature.id], current_feature, v, start_feature, end_feature, start_point, end_point) + via_point, d = get_route_step_dist( + prev[current_feature.id], + current_feature, + v, + start_feature, + end_feature, + start_point, + end_point, + ) alternate_dist = dist[current_feature.id] + d if alternate_dist < dist[v.id]: dist[v.id] = alternate_dist prev[v.id] = current_feature prev_via_point[v.id] = via_point - + steps = [] current_feature = end_feature if prev[current_feature.id] is not None or current_feature.id == start_feature.id: while current_feature is not None: - steps.insert(0, RouteStep(current_feature, prev_via_point[current_feature.id])) + steps.insert( + 0, RouteStep(current_feature, prev_via_point[current_feature.id]) + ) current_feature = prev[current_feature.id] r = Route(round(dist[end_feature.id], 2), steps) - return r \ No newline at end of file + return r diff --git a/gers/examples/python/tests/match_traces_test.py b/gers/examples/python/tests/match_traces_test.py index 07cd77a8b..90a1b6b3f 100644 --- a/gers/examples/python/tests/match_traces_test.py +++ b/gers/examples/python/tests/match_traces_test.py @@ -1,20 +1,26 @@ -import test_setup -import os import json +import os import unittest + import constants from match_classes import TraceSnapOptions from match_traces import get_trace_matches -from utils import load_matchable_set, get_features_with_cells +from utils import 
get_features_with_cells, load_matchable_set -class TestTraces(unittest.TestCase): +class TestTraces(unittest.TestCase): def test_match_traces(self): - features_to_match_file = os.path.join(constants.DATA_DIR, "macon-manual-traces.geojson") - overture_file = os.path.join(constants.DATA_DIR, "overture-transportation-macon.geojson") + features_to_match_file = os.path.join( + constants.DATA_DIR, "macon-manual-traces.geojson" + ) + overture_file = os.path.join( + constants.DATA_DIR, "overture-transportation-macon.geojson" + ) res = 12 - to_match = load_matchable_set(features_to_match_file, is_multiline=False, res=res) + to_match = load_matchable_set( + features_to_match_file, is_multiline=False, res=res + ) self.assertIsNotNone(to_match) self.assertEqual(len(to_match.features_by_id), 4) @@ -22,19 +28,26 @@ def test_match_traces(self): self.assertIn(id_to_match, to_match.features_by_id) source_feature = to_match.features_by_id[id_to_match] - overture = load_matchable_set(overture_file, is_multiline=True, properties_filter = {"type": "segment"}, res=res) + overture = load_matchable_set( + overture_file, + is_multiline=True, + properties_filter={"type": "segment"}, + res=res, + ) self.assertIsNotNone(overture.features_by_id) self.assertGreater(len(overture.features_by_id), 20000) options = TraceSnapOptions(max_point_to_road_distance=30) - target_candidates = get_features_with_cells(overture.features_by_cell, to_match.cells_by_id[source_feature.id]) + target_candidates = get_features_with_cells( + overture.features_by_cell, to_match.cells_by_id[source_feature.id] + ) match_res = get_trace_matches(source_feature, target_candidates, options) self.assertIsNotNone(match_res) self.assertIsNotNone(match_res.points) self.assertEqual(len(match_res.points), len(source_feature.geometry.coords)) self.assertGreater(match_res.source_length, 5000) - self.assertGreater(match_res.route_length, 5000) + self.assertGreater(match_res.route_length, 5000) json_res = match_res.to_json() 
self.assertIsNotNone(json_res) @@ -48,5 +61,6 @@ def test_match_traces(self): if idx > 0: self.assertGreater(bp.route_distance_to_prev_point, 0.0) -if __name__ == '__main__': - unittest.main() \ No newline at end of file + +if __name__ == "__main__": + unittest.main() diff --git a/gers/examples/python/tests/test_setup.py b/gers/examples/python/tests/test_setup.py index 457368324..2bf5cc41a 100644 --- a/gers/examples/python/tests/test_setup.py +++ b/gers/examples/python/tests/test_setup.py @@ -1,5 +1,5 @@ -import sys import os +import sys parent_dir = os.path.dirname(os.path.abspath(__file__)) -sys.path.insert(0, os.path.dirname(parent_dir)) \ No newline at end of file +sys.path.insert(0, os.path.dirname(parent_dir)) diff --git a/gers/examples/python/tests/utils_test.py b/gers/examples/python/tests/utils_test.py index 22e82586b..6733e4999 100644 --- a/gers/examples/python/tests/utils_test.py +++ b/gers/examples/python/tests/utils_test.py @@ -1,14 +1,21 @@ -import test_setup import os import unittest + import constants -from utils import get_distance, get_linestring_length, get_intersecting_h3_cells_for_geo_json, load_matchable_set -from shapely import Point, LineString +from shapely import LineString, Point +from utils import ( + get_distance, + get_intersecting_h3_cells_for_geo_json, + get_linestring_length, + load_matchable_set, +) -class TestUtils(unittest.TestCase): +class TestUtils(unittest.TestCase): def test_load_matchable_set_geojson(self): - features_to_match_file = os.path.join(constants.DATA_DIR, "macon-manual-traces.geojson") + features_to_match_file = os.path.join( + constants.DATA_DIR, "macon-manual-traces.geojson" + ) s = load_matchable_set(features_to_match_file, res=12, is_multiline=False) self.assertIsNotNone(s) self.assertEqual(len(s.features_by_id), 4) @@ -18,35 +25,60 @@ def test_load_matchable_set_geojson(self): def test_get_distance(self): p1 = Point(-83.6878343, 32.8413587) p2 = Point(-83.6877941, 32.8413903) - + d = get_distance(p1, p2) 
self.assertAlmostEqual(d, 5.1, delta=0.1) def test_get_linestring_length(self): l = LineString([(-83.6878343, 32.8413587), (-83.6877941, 32.8413903)]) - + d = get_linestring_length(l) - self.assertAlmostEqual(d, 5.1, delta=0.1) + self.assertAlmostEqual(d, 5.1, delta=0.1) def test_get_intersecting_h3_cells_for_geo_json(self): - point = { "type": "Point", "coordinates": [-83.6197063, 32.8589311] } + point = {"type": "Point", "coordinates": [-83.6197063, 32.8589311]} actual_cells = get_intersecting_h3_cells_for_geo_json(point, 10) expected_cells = ["8a44c0a32867fff"] self.assertCountEqual(actual_cells, expected_cells) - line = { "type": "LineString", "coordinates": [[-83.61940200000001, 32.858034], [-83.61940200000001, 32.859538]] } + line = { + "type": "LineString", + "coordinates": [ + [-83.61940200000001, 32.858034], + [-83.61940200000001, 32.859538], + ], + } actual_cells = get_intersecting_h3_cells_for_geo_json(line, 10) expected_cells = ["8a44c0a3295ffff", "8a44c0a32867fff"] self.assertCountEqual(actual_cells, expected_cells) - polygon = { "type": "Polygon", "coordinates": [[[-83.6195695, 32.8591587], [-83.6192584, 32.8583723], [-83.619135, 32.8590437], [-83.6195695, 32.8591587]]] } + polygon = { + "type": "Polygon", + "coordinates": [ + [ + [-83.6195695, 32.8591587], + [-83.6192584, 32.8583723], + [-83.619135, 32.8590437], + [-83.6195695, 32.8591587], + ] + ], + } actual_cells = get_intersecting_h3_cells_for_geo_json(polygon, 10) expected_cells = ["8a44c0a32877fff", "8a44c0a32867fff", "8a44c0a3295ffff"] self.assertCountEqual(actual_cells, expected_cells) - ml = { "type": "MultiLineString", "coordinates": [[[-83.61940200000001, 32.858034], [-83.61940200000001, 32.859538]], [[-83.6215878, 32.8580366], [-83.6202145, 32.8580546]]] } + ml = { + "type": "MultiLineString", + "coordinates": [ + [[-83.61940200000001, 32.858034], [-83.61940200000001, 32.859538]], + [[-83.6215878, 32.8580366], [-83.6202145, 32.8580546]], + ], + } actual_cells = 
get_intersecting_h3_cells_for_geo_json(ml, 10) - expected_cells = ["8a44c0a3294ffff", "8a44c0a32867fff", "8a44c0a304b7fff", "8a44c0a3295ffff"] + expected_cells = [ + "8a44c0a3294ffff", + "8a44c0a32867fff", + "8a44c0a304b7fff", + "8a44c0a3295ffff", + ] self.assertCountEqual(actual_cells, expected_cells) - - diff --git a/gers/examples/python/utils.py b/gers/examples/python/utils.py index c66599f24..7d613b19a 100644 --- a/gers/examples/python/utils.py +++ b/gers/examples/python/utils.py @@ -1,116 +1,142 @@ import csv import json -from haversine import haversine, Unit -#from shapely.ops import transform -from shapely import wkt -from shapely.geometry import shape, mapping -from shapely.geometry.base import BaseGeometry +from collections.abc import Iterable +from typing import Any + from dateutil import parser -from typing import Any, Dict, Iterable from h3 import h3 - +from haversine import Unit, haversine from match_classes import MatchableFeature, MatchableFeaturesSet -#from pyproj import Geod + +# from shapely.ops import transform +from shapely import wkt +from shapely.geometry import mapping, shape +from shapely.geometry.base import BaseGeometry + +# from pyproj import Geod + def get_seconds_elapsed(t1_str, t2_str): t1 = parser.parse(t1_str) t2 = parser.parse(t2_str) return (t2 - t1).total_seconds() + def get_linestring_length(ls): length = 0 for i in range(len(ls.coords) - 1): lon1, lat1 = ls.coords[i] - lon2, lat2 = ls.coords[i+1] - #_, _, d = geod.inv(lon1, lat1, lon2, lat2) + lon2, lat2 = ls.coords[i + 1] + # _, _, d = geod.inv(lon1, lat1, lon2, lat2) d = haversine((lat1, lon1), (lat2, lon2), unit=Unit.METERS) - length += d + length += d return round(length, 2) + def get_distance(point1, point2): - #_, _, d = geod.inv(point1.x, point1.y, point2.x, point2.y) + # _, _, d = geod.inv(point1.x, point1.y, point2.x, point2.y) d = haversine((point1.y, point1.x), (point2.y, point2.x), unit=Unit.METERS) return round(d, 2) + def get_intersecting_h3_cells_for_line(coords, 
res): """for coordinates of a linestring, gets all h3 cells of given resolution that intersect the line""" - cells = set() - prevCell = None + cells = set() + prevCell = None for coord in coords: cell = h3.geo_to_h3(coord[1], coord[0], res) cells.add(cell) - if (prevCell is None): + if prevCell is None: prevCell = cell else: - if (prevCell != cell): + if prevCell != cell: # two consecutive coordinates in the linestring may be more than one cell apart # need to find intermediate cells between previous cell and the current one - if (not h3.h3_indexes_are_neighbors(prevCell, cell)): + if not h3.h3_indexes_are_neighbors(prevCell, cell): intermediateCells = h3.h3_line(prevCell, cell) for intermediateCell in intermediateCells: cells.add(intermediateCell) prevCell = cell return cells -def get_intersecting_h3_cells_for_geo_json(geometry: Any, res:int) -> Iterable[str]: + +def get_intersecting_h3_cells_for_geo_json(geometry: Any, res: int) -> Iterable[str]: """gets all h3 cells of given resolution that intersect the geometry.""" # h3 api wants two floats for point, geojson dict for polygon and custom code is needed for line and multi* geometries geojson = mapping(geometry) if isinstance(geometry, BaseGeometry) else geometry geom_type = geojson["type"] coords = geojson["coordinates"] - if (geom_type.startswith("Multi")): + if geom_type.startswith("Multi"): sub_geom_type = geom_type.replace("Multi", "") - sub_geoms = [{"type": sub_geom_type, "coordinates": sub_geom_coords } for sub_geom_coords in coords] - sub_cells = [sub_cell for sub_geom in sub_geoms for sub_cell in get_intersecting_h3_cells_for_geo_json(sub_geom, res)] + sub_geoms = [ + {"type": sub_geom_type, "coordinates": sub_geom_coords} + for sub_geom_coords in coords + ] + sub_cells = [ + sub_cell + for sub_geom in sub_geoms + for sub_cell in get_intersecting_h3_cells_for_geo_json(sub_geom, res) + ] return set(sub_cells) - if (geom_type == "Point"): + if geom_type == "Point": return set([h3.geo_to_h3(coords[1], 
coords[0], res)]) - if (geom_type == "LineString"): + if geom_type == "LineString": return get_intersecting_h3_cells_for_line(coords, res) - if (geom_type == "Polygon"): - innerCells = h3.polyfill(geojson, res, True) # this only covers the tiles whose centers are inside the polygon + if geom_type == "Polygon": + innerCells = h3.polyfill( + geojson, res, True + ) # this only covers the tiles whose centers are inside the polygon boundaryCells = get_intersecting_h3_cells_for_line(coords[0], res) return innerCells | boundaryCells -def matches_properties_filter(feature: Dict[str, Any], properties_filter: Dict[str, Any]) -> bool: + +def matches_properties_filter( + feature: dict[str, Any], properties_filter: dict[str, Any] +) -> bool: if properties_filter is None: return True feat_props = feature.get("properties") for prop in properties_filter: if prop == "id": return feature.get("id") == properties_filter[prop] - - if not prop in feat_props or (properties_filter[prop] != "*" and feat_props[prop] != properties_filter[prop]): + + if prop not in feat_props or ( + properties_filter[prop] != "*" + and feat_props[prop] != properties_filter[prop] + ): return False - return True + return True + -def get_matchable_feature(feature_dict: Dict[str, Any]) -> MatchableFeature: +def get_matchable_feature(feature_dict: dict[str, Any]) -> MatchableFeature: """creates a MatchableFeature from a dict with expected keys [id, geometry, properties], which could be either a geojson or parsed from a csv file with wkt geometry""" id = feature_dict.get("id") - geom = feature_dict.get("geometry") + geom = feature_dict.get("geometry") if type(geom) is dict and "type" in geom and "coordinates" in geom: # if it"s a geojson feature - s = shape(geom) - elif isinstance(geom, str): + s = shape(geom) + elif isinstance(geom, str): # if it"s a wkt string s = wkt.loads(geom) - props = feature_dict.get("properties") + props = feature_dict.get("properties") return MatchableFeature(id, s, props) -def 
get_feature_cells(geom: Any, res: int, k_rings_to_add:int=1): + +def get_feature_cells(geom: Any, res: int, k_rings_to_add: int = 1): """gets all h3 cells of given resolution that intersect the geometry, and also the cells that are k rings around the intersecting cells""" h3_cells = get_intersecting_h3_cells_for_geo_json(geom, res) if k_rings_to_add == 0: return list(h3_cells) - + rings = [h3.k_ring(h, k_rings_to_add) for h in h3_cells] return list(set(cell for r in rings for cell in r)) -def parse_geojson(filename: str, is_multiline: bool) -> Iterable[Dict[str, Any]]: - with open(filename, mode="r", errors="ignore") as file: + +def parse_geojson(filename: str, is_multiline: bool) -> Iterable[dict[str, Any]]: + with open(filename, errors="ignore") as file: if is_multiline: # text file with one geojson per line - i=0 + i = 0 features = [] for line in file: i += 1 @@ -118,16 +144,22 @@ def parse_geojson(filename: str, is_multiline: bool) -> Iterable[Dict[str, Any]] geojson = json.loads(line.strip().rstrip(",")) features.append(geojson) except Exception as x: - print(fr"Line {i}: " + str(x)) + print(rf"Line {i}: " + str(x)) return features else: full_gj = json.loads(file.read()) - if full_gj.get("type") == "FeatureCollection": + if full_gj.get("type") == "FeatureCollection": return full_gj.get("features") else: return [full_gj] -def get_matchable_set(features: Iterable[Dict[str, Any]], properties_filter: dict=None, res: int=12, limit_feature_count=-1) -> MatchableFeaturesSet: + +def get_matchable_set( + features: Iterable[dict[str, Any]], + properties_filter: dict = None, + res: int = 12, + limit_feature_count=-1, +) -> MatchableFeaturesSet: features_by_id = {} cells_by_id = {} features_by_cell = {} @@ -140,7 +172,7 @@ def get_matchable_set(features: Iterable[Dict[str, Any]], properties_filter: dic features_by_id[feature.id] = feature cells_by_id[feature.id] = get_feature_cells(feature.geometry, res) for cell in cells_by_id[feature.id]: - if not cell in 
features_by_cell: + if cell not in features_by_cell: features_by_cell[cell] = [] features_by_cell[cell].append(feature) except Exception as x: @@ -150,11 +182,12 @@ def get_matchable_set(features: Iterable[Dict[str, Any]], properties_filter: dic break return MatchableFeaturesSet(features_by_id, cells_by_id, features_by_cell) -def parse_csv(filename: str, delimiter: str=",") -> MatchableFeaturesSet: + +def parse_csv(filename: str, delimiter: str = ",") -> MatchableFeaturesSet: features = [] - i=0 - with open(filename, mode="r", errors="ignore") as file: - reader = csv.DictReader(file, delimiter=delimiter) + i = 0 + with open(filename, errors="ignore") as file: + reader = csv.DictReader(file, delimiter=delimiter) for row in reader: feat_dict = {} feat_dict["properties"] = {} @@ -162,28 +195,43 @@ def parse_csv(filename: str, delimiter: str=",") -> MatchableFeaturesSet: default_id = str(i) i += 1 key = k.lower() - if not "id" in feat_dict and "id" in key: + if "id" not in feat_dict and "id" in key: feat_dict["id"] = v - elif not "geometry" in feat_dict and ("geometry" in key or "wkt" in key): + elif "geometry" not in feat_dict and ( + "geometry" in key or "wkt" in key + ): feat_dict["geometry"] = v else: feat_dict["properties"][k] = v - if not "id" in feat_dict: + if "id" not in feat_dict: feat_dict["id"] = default_id - if not "geometry" in feat_dict: - if "lat" in feat_dict["properties"] and "lon" in feat_dict["properties"]: - feat_dict["geometry"] = f'POINT({feat_dict["properties"]["lon"]} {feat_dict["properties"]["lat"]})' + if "geometry" not in feat_dict: + if ( + "lat" in feat_dict["properties"] + and "lon" in feat_dict["properties"] + ): + feat_dict["geometry"] = ( + f"POINT({feat_dict['properties']['lon']} {feat_dict['properties']['lat']})" + ) else: continue - + features.append(feat_dict) return features -def load_matchable_set(filename: str, properties_filter: dict=None, res: int=12, limit_feature_count=-1, is_multiline: bool=False, delimiter: str=",") -> 
MatchableFeaturesSet: + +def load_matchable_set( + filename: str, + properties_filter: dict = None, + res: int = 12, + limit_feature_count=-1, + is_multiline: bool = False, + delimiter: str = ",", +) -> MatchableFeaturesSet: """loads a MatchableFeaturesSet from a geojson or csv file""" - extension = filename.split(".")[-1] + extension = filename.split(".")[-1] match extension: case "geojson" | "json": features = parse_geojson(filename, is_multiline=is_multiline) @@ -191,22 +239,26 @@ def load_matchable_set(filename: str, properties_filter: dict=None, res: int=12, features = parse_csv(filename, delimiter=delimiter) case _: raise Exception(f"Unsupported file type: {extension}") - + s = get_matchable_set(features, properties_filter, res, limit_feature_count) return s - -def get_features_with_cells(features_by_cell: Dict[str, Iterable[MatchableFeature]], cells_filter: Iterable[str]) -> Iterable[MatchableFeature]: + + +def get_features_with_cells( + features_by_cell: dict[str, Iterable[MatchableFeature]], cells_filter: Iterable[str] +) -> Iterable[MatchableFeature]: """gets all features in `features_by_cell` that intersect any of the cells in `cells_filter`""" with_cells = [] candidate_ids = set() for cell in cells_filter: if cell in features_by_cell: for candidate in features_by_cell[cell]: - if not candidate.id in candidate_ids: + if candidate.id not in candidate_ids: candidate_ids.add(candidate.id) with_cells.append(candidate) return with_cells + def write_json(results_json: Any, output_file_name: str): with open(output_file_name, "w") as f: - json.dump(results_json, f, indent=4) \ No newline at end of file + json.dump(results_json, f, indent=4) diff --git a/packages/overture-schema-divisions-theme/pyproject.toml b/packages/overture-schema-divisions-theme/pyproject.toml index e56b1df6a..565e30ba4 100644 --- a/packages/overture-schema-divisions-theme/pyproject.toml +++ b/packages/overture-schema-divisions-theme/pyproject.toml @@ -4,7 +4,7 @@ dependencies = [ 
"overture-schema-system", "pydantic>=2.0", ] -description = "Overture Maps divisions theme shared structures, division, division area and division boundary types" +description = "Overture Maps divisions theme shared structures, division, division area, division boundary, and geographic area types" dynamic = ["version"] license = "MIT" name = "overture-schema-divisions-theme" @@ -29,3 +29,4 @@ packages = ["src/overture"] "overture:divisions:division" = "overture.schema.divisions:Division" "overture:divisions:division_area" = "overture.schema.divisions:DivisionArea" "overture:divisions:division_boundary" = "overture.schema.divisions:DivisionBoundary" +"overture:divisions:geographic_area" = "overture.schema.divisions:GeographicArea" diff --git a/packages/overture-schema-divisions-theme/src/overture/schema/divisions/__init__.py b/packages/overture-schema-divisions-theme/src/overture/schema/divisions/__init__.py index ea08ff2e5..7b3cc025a 100644 --- a/packages/overture-schema-divisions-theme/src/overture/schema/divisions/__init__.py +++ b/packages/overture-schema-divisions-theme/src/overture/schema/divisions/__init__.py @@ -9,5 +9,6 @@ from .division import Division from .division_area import DivisionArea from .division_boundary import DivisionBoundary +from .geographic_area import GeographicArea -__all__ = ["Division", "DivisionArea", "DivisionBoundary"] +__all__ = ["Division", "DivisionArea", "DivisionBoundary", "GeographicArea"] diff --git a/packages/overture-schema-divisions-theme/src/overture/schema/divisions/geographic_area/__init__.py b/packages/overture-schema-divisions-theme/src/overture/schema/divisions/geographic_area/__init__.py new file mode 100644 index 000000000..2a19485b6 --- /dev/null +++ b/packages/overture-schema-divisions-theme/src/overture/schema/divisions/geographic_area/__init__.py @@ -0,0 +1,6 @@ +"""Geographic area feature type.""" + +from .enums import GeographicAreaClass, GeographicAreaSubtype +from .models import GeographicArea + +__all__ = 
["GeographicArea", "GeographicAreaClass", "GeographicAreaSubtype"] diff --git a/packages/overture-schema-divisions-theme/src/overture/schema/divisions/geographic_area/enums.py b/packages/overture-schema-divisions-theme/src/overture/schema/divisions/geographic_area/enums.py new file mode 100644 index 000000000..3c8b7ea33 --- /dev/null +++ b/packages/overture-schema-divisions-theme/src/overture/schema/divisions/geographic_area/enums.py @@ -0,0 +1,37 @@ +"""Geography enums for Overture Maps divisions theme.""" + +from enum import Enum + + +class GeographicAreaSubtype(str, Enum): + """ + The type of geographic area feature. + + - functional: Regions defined by functional characteristics or usage patterns + (e.g., postal codes, economic zones). + + - cultural: Regions defined by cultural identity, colloquial usage, or shared + cultural characteristics (e.g., "East Asia", "California Wine Country"). + """ + + FUNCTIONAL = "functional" + CULTURAL = "cultural" + + +class GeographicAreaClass(str, Enum): + """ + Classification of the geographic area feature. + """ + + # Colloquial regions are informal, culturally defined, or commonly referenced areas + # that do not correspond to official administrative boundaries. Unlike countries, + # states, counties, or cities—whose boundaries are legally defined—colloquial regions + # evolve from cultural, historical, economic, or linguistic identity. + # Examples include South Florida, East Asia, and California Wine Country. + # Only applicable to cultural subtype. + COLLOQUIAL = "colloquial" + + # Postal code regions used for mail delivery and routing. + # Examples include US ZIP codes, UK postcodes, and Canadian postal codes. + # Only applicable to functional subtype. 
+ POSTAL = "postal" diff --git a/packages/overture-schema-divisions-theme/src/overture/schema/divisions/geographic_area/models.py b/packages/overture-schema-divisions-theme/src/overture/schema/divisions/geographic_area/models.py new file mode 100644 index 000000000..51096c94b --- /dev/null +++ b/packages/overture-schema-divisions-theme/src/overture/schema/divisions/geographic_area/models.py @@ -0,0 +1,108 @@ +"""Geography models for Overture Maps divisions theme.""" + +from typing import Annotated, Literal + +from pydantic import ConfigDict, Field, model_validator + +from overture.schema.core import OvertureFeature +from overture.schema.core.cartography import CartographicallyHinted +from overture.schema.core.names import Named, Names +from overture.schema.system.field_constraint import UniqueItemsConstraint +from overture.schema.system.primitive import ( + Geometry, + GeometryType, + GeometryTypeConstraint, + int32, +) +from overture.schema.system.ref import Id, Reference, Relationship + +from ..division.models import Division +from .enums import GeographicAreaClass, GeographicAreaSubtype + + +class GeographicArea( + OvertureFeature[Literal["divisions"], Literal["geographic_area"]], + Named, + CartographicallyHinted, +): + """Geographic area features represent functional or cultural regions that may span across + multiple administrative divisions. + + These regions capture areas defined by shared characteristics, usage patterns, or + cultural identity rather than formal administrative boundaries. + + Examples include postal code regions (functional) or colloquial regions like "East Asia" + or "California Wine Country" (cultural). + """ + + model_config = ConfigDict(title="geographic_area") + + # Core + geometry: Annotated[ + Geometry, + GeometryTypeConstraint(GeometryType.POLYGON, GeometryType.MULTI_POLYGON), + Field( + description="Geography geometry MUST be a Polygon or MultiPolygon as defined by GeoJSON schema. 
The geometry is constructed from associated divisions or available sources.", + ), + ] + + # Required + + names: Names + subtype: Annotated[ + GeographicAreaSubtype, + Field( + description="""The type of geography feature. + +- functional: Regions defined by functional characteristics or usage patterns (e.g., postal codes, economic zones). + +- cultural: Regions defined by cultural identity, colloquial usage, or shared cultural characteristics (e.g., "East Asia", "California Wine Country").""" + ), + ] + + class_: Annotated[ + GeographicAreaClass, + Field( + alias="class", + description="Classification of the geography feature. Colloquial class is only allowed for cultural subtype. Postal class is only allowed for functional subtype.", + ), + ] + + # Optional + + associated_division_ids: Annotated[ + list[Id] | None, + UniqueItemsConstraint(), + Field( + description="Optional list of division IDs representing the set of divisions that make up this geography region. This property links the geography to the underlying administrative divisions it encompasses or relates to. 
May be null if the region cannot be precisely mapped to specific administrative divisions.", + min_length=1, + ), + Reference(Relationship.BOUNDARY_OF, Division), + ] = None + + population: Annotated[ + int32 | None, + Field( + description="Optional population represented in the region, if inferable from associated divisions or available sources.", + ge=0, + ), + ] = None + + @model_validator(mode="after") + def validate_class_rules(self) -> "GeographicArea": + """Validate class field rules.""" + # Colloquial class only allowed for cultural subtype + if ( + self.class_ == GeographicAreaClass.COLLOQUIAL + and self.subtype != GeographicAreaSubtype.CULTURAL + ): + raise ValueError("colloquial class is only allowed for cultural subtype") + + # Postal class only allowed for functional subtype + if ( + self.class_ == GeographicAreaClass.POSTAL + and self.subtype != GeographicAreaSubtype.FUNCTIONAL + ): + raise ValueError("postal class is only allowed for functional subtype") + + return self diff --git a/packages/overture-schema-divisions-theme/tests/geographic_area_baseline_schema.json b/packages/overture-schema-divisions-theme/tests/geographic_area_baseline_schema.json new file mode 100644 index 000000000..d5b22bcb2 --- /dev/null +++ b/packages/overture-schema-divisions-theme/tests/geographic_area_baseline_schema.json @@ -0,0 +1,468 @@ +{ + "$defs": { + "CartographicHints": { + "additionalProperties": false, + "description": "Cartographic hints for optimal use of Overture features in map-making.", + "properties": { + "max_zoom": { + "description": "Recommended maximum tile zoom level in which this feature should be displayed.\n\nIt is recommended that the feature be hidden at zoom levels above this value.\n\nZoom levels follow the Slippy Maps convention, documented in the following\nreferences:\n- https://wiki.openstreetmap.org/wiki/Slippy_map_tilenames\n- https://www.maptiler.com/google-maps-coordinates-tile-bounds-projection", + "maximum": 23, + "minimum": 0, + 
"title": "Max Zoom", + "type": "integer" + }, + "min_zoom": { + "description": "Recommended minimum tile zoom level in which this feature should be displayed.\n\nIt is recommended that the feature be hidden at zoom levels below this value.\n\nZoom levels follow the Slippy Maps convention, documented in the following\nreferences:\n- https://wiki.openstreetmap.org/wiki/Slippy_map_tilenames\n- https://www.maptiler.com/google-maps-coordinates-tile-bounds-projection", + "maximum": 23, + "minimum": 0, + "title": "Min Zoom", + "type": "integer" + }, + "prominence": { + "description": "Subjective scale of feature significance or importance, with 1 being the least, and\n100 being the most, significant.\n\nThis value can be used to help drive decisions about how and when to display a\nfeature, and how to treat it relative to neighboring features.\n\nWhen populated by Overture, this value is derived from various factors including,\nbut not limited to: feature and subtype, population, and capital status.", + "maximum": 100, + "minimum": 1, + "title": "Prominence", + "type": "integer" + }, + "sort_key": { + "description": "Integer indicating the recommended order in which to draw features.\n\nFeatures with a lower number should be drawn \"in front\" of features with a higher\nnumber.", + "maximum": 255, + "minimum": 0, + "title": "Sort Key", + "type": "integer" + } + }, + "title": "CartographicHints", + "type": "object" + }, + "GeographicAreaClass": { + "description": "Classification of the geographic area feature.", + "enum": [ + "colloquial", + "postal" + ], + "title": "GeographicAreaClass", + "type": "string" + }, + "GeographicAreaSubtype": { + "description": "The type of geographic area feature.\n\n- functional: Regions defined by functional characteristics or usage patterns\n (e.g., postal codes, economic zones).\n\n- cultural: Regions defined by cultural identity, colloquial usage, or shared\n cultural characteristics (e.g., \"East Asia\", \"California Wine Country\").", 
+ "enum": [ + "functional", + "cultural" + ], + "title": "GeographicAreaSubtype", + "type": "string" + }, + "NameRule": { + "additionalProperties": false, + "description": "A rule that can be evaluated to determine the name in advanced scenarios.\n\nName rules are used for cases where the primary name is not sufficient; the common name is not\nthe right fit for the use case and another variant is needed; or where the name only applies in\ncertain specific circumstances.\n\nExamples might include:\n- An official, alternate, or short name.\n- A name that only applies to part of a linear path like a road segment (geometric range\n scoping).\n- A name that only applies to the left or right side of a linear path like a road segment (side\n scoping).\n- A name that is only accepted by some political perspectives.", + "properties": { + "between": { + "description": "The linearly-referenced sub-segment of the geometry, specified as a range (pair) of percentage displacements from the start of the geometry, that the containing NameRule applies to.", + "items": { + "maximum": 1.0, + "minimum": 0.0, + "type": "number" + }, + "maxItems": 2, + "minItems": 2, + "title": "Between", + "type": "array" + }, + "language": { + "description": "The language in which the name `value` is specified, if known, as an IETF BCP 47\nlanguage tag.", + "pattern": "^(?:(?:[A-Za-z]{2,3}(?:-[A-Za-z]{3}){0,3}?)|(?:[A-Za-z]{4,8}))(?:-[A-Za-z]{4})?(?:-[A-Za-z]{2}|[0-9]{3})?(?:-(?:[A-Za-z0-9]{5,8}|[0-9][A-Za-z0-9]{3}))*(?:-[A-WY-Za-wy-z0-9](?:-[A-Za-z0-9]{2,8})+)*$", + "title": "Language", + "type": "string" + }, + "perspectives": { + "$ref": "#/$defs/Perspectives", + "description": "Political perspectives from which a named feature is viewed." + }, + "side": { + "$ref": "#/$defs/Side", + "description": "The side, either left or right, that the containing NameRule applies to." 
+ }, + "value": { + "description": "The actual name value.", + "minLength": 1, + "pattern": "^(\\S.*)?\\S$", + "title": "Value", + "type": "string" + }, + "variant": { + "$ref": "#/$defs/NameVariant", + "description": "The name variant for this name rule." + } + }, + "required": [ + "value", + "variant" + ], + "title": "NameRule", + "type": "object" + }, + "NameVariant": { + "description": "Name variant used in a `NameRule`.", + "enum": [ + "common", + "official", + "alternate", + "short" + ], + "title": "NameVariant", + "type": "string" + }, + "Names": { + "additionalProperties": false, + "description": "Multilingual names container.", + "properties": { + "common": { + "additionalProperties": false, + "patternProperties": { + "^(?:(?:[A-Za-z]{2,3}(?:-[A-Za-z]{3}){0,3}?)|(?:[A-Za-z]{4,8}))(?:-[A-Za-z]{4})?(?:-[A-Za-z]{2}|[0-9]{3})?(?:-(?:[A-Za-z0-9]{5,8}|[0-9][A-Za-z0-9]{3}))*(?:-[A-WY-Za-wy-z0-9](?:-[A-Za-z0-9]{2,8})+)*$": { + "description": "String with no leading/trailing whitespace", + "pattern": "^(\\S.*)?\\S$", + "type": "string" + } + }, + "propertyNames": { + "description": "IETF BCP-47 language tag" + }, + "title": "Common", + "type": "object" + }, + "primary": { + "description": "The most commonly used name.", + "minLength": 1, + "pattern": "^(\\S.*)?\\S$", + "title": "Primary", + "type": "string" + }, + "rules": { + "description": "Rules for names that cannot be specified in the simple common names property. 
These rules can cover other name variants such as official, alternate, and short; and they can optionally include geometric scoping (linear referencing) and side-of-road scoping for complex cases.", + "items": { + "$ref": "#/$defs/NameRule" + }, + "title": "Rules", + "type": "array" + } + }, + "required": [ + "primary" + ], + "title": "Names", + "type": "object" + }, + "PerspectiveMode": { + "description": "Perspective mode for disputed names.", + "enum": [ + "accepted_by", + "disputed_by" + ], + "title": "PerspectiveMode", + "type": "string" + }, + "Perspectives": { + "additionalProperties": false, + "description": "Political perspectives container.", + "properties": { + "countries": { + "description": "Countries holding the given mode of perspective.", + "items": { + "description": "ISO 3166-1 alpha-2 country code", + "maxLength": 2, + "minLength": 2, + "pattern": "^[A-Z]{2}$", + "type": "string" + }, + "minItems": 1, + "title": "Countries", + "type": "array", + "uniqueItems": true + }, + "mode": { + "$ref": "#/$defs/PerspectiveMode", + "description": "Whether the perspective holder accepts or disputes this name." 
+ } + }, + "required": [ + "mode", + "countries" + ], + "title": "Perspectives", + "type": "object" + }, + "Side": { + "description": "The side, left or right, on which something appears relative to a facing or heading direction\n(*e.g.*, the side of a road relative to the road orientation), or relative to the direction of\ntravel of a person or vehicle.", + "enum": [ + "left", + "right" + ], + "title": "Side", + "type": "string" + }, + "SourceItem": { + "additionalProperties": false, + "description": "Specifies the source of the data used for a feature or one of its properties.", + "properties": { + "between": { + "description": "The linearly-referenced sub-segment of the geometry, specified as a range (pair) of percentage displacements from the start of the geometry, that the containing SourceItem applies to.", + "items": { + "maximum": 1.0, + "minimum": 0.0, + "type": "number" + }, + "maxItems": 2, + "minItems": 2, + "title": "Between", + "type": "array" + }, + "confidence": { + "description": "Confidence value from the source dataset.\n\nThis is a value between 0.0 and 1.0 and is particularly relevant for ML-derived data.", + "maximum": 1.0, + "minimum": 0.0, + "title": "Confidence", + "type": "number" + }, + "dataset": { + "description": "Name of the dataset where the source data can be found.", + "title": "Dataset", + "type": "string" + }, + "license": { + "description": "Source data license name.\n\nThis should be a valid SPDX license identifier when available.\n\nIf omitted, contact the data provider for more license information.", + "pattern": "^(\\S.*)?\\S$", + "title": "License", + "type": "string" + }, + "property": { + "description": "A JSON Pointer identifying the property (field) that this source information applies to.\n\nThe root document value `\"\"` indicates that this source information applies to the\nentire feature, excepting properties (fields) for which a dedicated source information\nrecord exists.\n\nAny other JSON Pointer apart from 
`\"\"` indicates that this source record provides\ndedicated source information for the property at the path in the JSON Pointer. As an\nexample, the value `\"/names/common/en\"` indicates that the source information applies to\nthe English common name of a named feature, while the value `\"/geometry\"` indicates that\nit applies to the feature geometry.", + "title": "Property", + "type": "string" + }, + "record_id": { + "description": "Identifies the specific record within the source dataset where the source data can\nbe found.\n\nThe format of record identifiers is dataset-specific.", + "title": "Record Id", + "type": "string" + }, + "update_time": { + "description": "Last update time of the source data record.", + "format": "date-time", + "title": "Update Time", + "type": "string" + } + }, + "required": [ + "property", + "dataset" + ], + "title": "SourceItem", + "type": "object" + } + }, + "additionalProperties": false, + "description": "Geographic area features represent functional or cultural regions that may span across\nmultiple administrative divisions.\n\nThese regions capture areas defined by shared characteristics, usage patterns, or\ncultural identity rather than formal administrative boundaries.\n\nExamples include postal code regions (functional) or colloquial regions like \"East Asia\"\nor \"California Wine Country\" (cultural).", + "properties": { + "bbox": { + "description": "An optional bounding box for the feature", + "items": { + "type": "number" + }, + "maxItems": 4, + "minItems": 4, + "title": "Bbox", + "type": "array" + }, + "geometry": { + "description": "Geography geometry MUST be a Polygon or MultiPolygon as defined by GeoJSON schema. 
The geometry is constructed from associated divisions or available sources.", + "oneOf": [ + { + "properties": { + "bbox": { + "items": { + "type": "number" + }, + "minItems": 4, + "type": "array" + }, + "coordinates": { + "items": { + "items": { + "items": { + "items": { + "type": "number" + }, + "maxItems": 3, + "minItems": 2, + "type": "array" + }, + "minItems": 4, + "type": "array" + }, + "minItems": 1, + "type": "array" + }, + "minItems": 1, + "type": "array" + }, + "type": { + "const": "MultiPolygon", + "type": "string" + } + }, + "required": [ + "type", + "coordinates" + ], + "type": "object" + }, + { + "properties": { + "bbox": { + "items": { + "type": "number" + }, + "minItems": 4, + "type": "array" + }, + "coordinates": { + "items": { + "items": { + "items": { + "type": "number" + }, + "maxItems": 3, + "minItems": 2, + "type": "array" + }, + "minItems": 4, + "type": "array" + }, + "minItems": 1, + "type": "array" + }, + "type": { + "const": "Polygon", + "type": "string" + } + }, + "required": [ + "type", + "coordinates" + ], + "type": "object" + } + ], + "title": "Geometry" + }, + "id": { + "description": "A feature ID. This may be an ID associated with the Global Entity Reference System (GERS) if\u2014and-only-if the feature represents an entity that is part of GERS.", + "minLength": 1, + "pattern": "^\\S+$", + "title": "Id", + "type": "string" + }, + "properties": { + "additionalProperties": false, + "not": { + "required": [ + "id", + "bbox", + "geometry" + ] + }, + "patternProperties": { + "^ext_.*$": { + "description": "Additional top-level properties are allowed if prefixed by `ext_`.\n\nThis feature is a on a deprecation path and will be removed once the schema is\nfully migrated to Pydantic." + } + }, + "properties": { + "associated_division_ids": { + "description": "Optional list of division IDs representing the set of divisions that make up this geography region. 
This property links the geography to the underlying administrative divisions it encompasses or relates to. May be null if the region cannot be precisely mapped to specific administrative divisions.", + "items": { + "description": "A unique identifier", + "minLength": 1, + "pattern": "^\\S+$", + "type": "string" + }, + "minLength": 1, + "title": "Associated Division Ids", + "type": "array", + "uniqueItems": true + }, + "cartography": { + "$ref": "#/$defs/CartographicHints", + "title": "cartography" + }, + "class": { + "$ref": "#/$defs/GeographicAreaClass", + "description": "Classification of the geography feature. Colloquial class is only allowed for cultural subtype. Postal class is only allowed for functional subtype." + }, + "names": { + "$ref": "#/$defs/Names" + }, + "population": { + "description": "Optional population represented in the region, if inferable from associated divisions or available sources.", + "maximum": 2147483647, + "minimum": 0, + "title": "Population", + "type": "integer" + }, + "sources": { + "description": "Information about the source data used to assemble the feature.", + "items": { + "$ref": "#/$defs/SourceItem" + }, + "minItems": 1, + "title": "Sources", + "type": "array", + "uniqueItems": true + }, + "subtype": { + "$ref": "#/$defs/GeographicAreaSubtype", + "description": "The type of geography feature.\n\n- functional: Regions defined by functional characteristics or usage patterns (e.g., postal codes, economic zones).\n\n- cultural: Regions defined by cultural identity, colloquial usage, or shared cultural characteristics (e.g., \"East Asia\", \"California Wine Country\")." 
+ }, + "theme": { + "const": "divisions", + "title": "Theme", + "type": "string" + }, + "type": { + "const": "geographic_area", + "title": "Type", + "type": "string" + }, + "version": { + "description": "", + "maximum": 2147483647, + "minimum": 0, + "title": "Version", + "type": "integer" + } + }, + "required": [ + "names", + "theme", + "type", + "version", + "subtype", + "class" + ], + "type": "object" + }, + "type": { + "const": "Feature", + "type": "string" + } + }, + "required": [ + "type", + "id", + "geometry", + "properties" + ], + "title": "geographic_area", + "type": "object" +} \ No newline at end of file diff --git a/packages/overture-schema-divisions-theme/tests/test_geographic_area_json_schema_baseline.py b/packages/overture-schema-divisions-theme/tests/test_geographic_area_json_schema_baseline.py new file mode 100644 index 000000000..6687d9faf --- /dev/null +++ b/packages/overture-schema-divisions-theme/tests/test_geographic_area_json_schema_baseline.py @@ -0,0 +1,32 @@ +"""Baseline JSON Schema tests for geographic area type.""" + +import json +import os + +from overture.schema.divisions import GeographicArea +from overture.schema.system.json_schema import json_schema + + +def test_geographic_area_json_schema_baseline() -> None: + """Test that GeographicArea generates consistent JSON Schema (baseline comparison).""" + schema = json_schema(GeographicArea) + + # Path to baseline file + baseline_file = os.path.join( + os.path.dirname(__file__), "geographic_area_baseline_schema.json" + ) + + # If baseline doesn't exist, create it + if not os.path.exists(baseline_file): + with open(baseline_file, "w") as f: + json.dump(schema, f, indent=2, sort_keys=True) + + # Load baseline and compare + with open(baseline_file) as f: + baseline_schema = json.load(f) + + # Compare the generated schema with the baseline + assert schema == baseline_schema, ( + "Generated JSON Schema differs from baseline. 
" + "If this change is intentional, delete the baseline file to regenerate it." + ) diff --git a/schema/defs.yaml b/schema/defs.yaml index a20de9047..551aa59bd 100644 --- a/schema/defs.yaml +++ b/schema/defs.yaml @@ -90,6 +90,7 @@ description: Common schema definitions shared by all themes - division - division_area - division_boundary + - geographic_area - infrastructure - land - land_cover diff --git a/schema/divisions/geographic_area.yaml b/schema/divisions/geographic_area.yaml new file mode 100644 index 000000000..1b0447a1e --- /dev/null +++ b/schema/divisions/geographic_area.yaml @@ -0,0 +1,99 @@ +--- +"$schema": https://json-schema.org/draft/2020-12/schema +title: geographic_area +description: + Geographic area features represent functional or cultural regions that may + span across multiple administrative divisions. These regions capture + areas defined by shared characteristics, usage patterns, or cultural + identity rather than formal administrative boundaries. + + Examples include postal code regions (functional) or colloquial regions + like "East Asia" or "California Wine Country" (cultural). +type: object +properties: # JSON Schema: Top-level object properties. + id: { "$ref": ../defs.yaml#/$defs/propertyDefinitions/id } + geometry: + description: + Geography geometry MUST be a Polygon or MultiPolygon as defined + by GeoJSON schema. The geometry is constructed from associated + divisions or available sources. + unevaluatedProperties: false + oneOf: + - "$ref": https://geojson.org/schema/Polygon.json + - "$ref": https://geojson.org/schema/MultiPolygon.json + properties: # GeoJSON: top-level object 'properties' property. 
+ unevaluatedProperties: false + required: [names, subtype, class] + allOf: + - "$ref": ../defs.yaml#/$defs/propertyContainers/overtureFeaturePropertiesContainer + - "$ref": ../defs.yaml#/$defs/propertyContainers/namesContainer + - "$ref": ../defs.yaml#/$defs/propertyContainers/cartographyContainer + - if: + properties: + class: { const: colloquial } + then: + properties: + subtype: { const: cultural } + - if: + properties: + class: { const: postal } + then: + properties: + subtype: { const: functional } + properties: # JSON Schema: properties within GeoJSON top-level object 'properties' property + subtype: + description: >- + The type of geography feature. + + functional: Regions defined by functional characteristics or + usage patterns (e.g., postal codes, economic zones). + + cultural: Regions defined by cultural identity, colloquial + usage, or shared cultural characteristics (e.g., "East Asia", + "California Wine Country"). + type: string + enum: + - functional # Region defined by functional characteristics + # or usage patterns. + + - cultural # Region defined by cultural identity or + # colloquial usage. + class: + description: >- + Classification of the geography feature. + + colloquial: Colloquial regions are informal, culturally defined, + or commonly referenced areas that do not correspond to official + administrative boundaries. Unlike countries, states, counties, or + cities—whose boundaries are legally defined—colloquial regions + evolve from cultural, historical, economic, or linguistic identity. + Examples include South Florida, East Asia, and California Wine Country. + Only applicable to cultural subtype. + + postal: Postal code regions used for mail delivery and routing. + Examples include US ZIP codes, UK postcodes, and Canadian postal codes. + Only applicable to functional subtype. + type: string + enum: + - colloquial # Informal, culturally defined regions without + # official administrative boundaries. 
+ + - postal # Postal code regions for mail delivery. + associated_division_ids: + description: + Optional list of division IDs representing the set of divisions + that make up this geography region. This property links the + geography to the underlying administrative divisions it + encompasses or relates to. May be null if the region cannot be + precisely mapped to specific administrative divisions. + type: array + minItems: 1 + uniqueItems: true + items: { "$ref": "../defs.yaml#/$defs/propertyDefinitions/id" } + population: + description: + Optional population represented in the region, if inferable from + associated divisions or available sources. + type: integer + minimum: 0 + wikidata: { "$ref": "../defs.yaml#/$defs/propertyDefinitions/wikidata" } diff --git a/schema/schema.yaml b/schema/schema.yaml index df59e6529..b01e56649 100644 --- a/schema/schema.yaml +++ b/schema/schema.yaml @@ -106,6 +106,14 @@ oneOf: type: { enum: [division_area] } then: { "$ref": divisions/division_area.yaml } else: { propertyNames: false } + - if: + properties: + properties: + properties: + theme: { enum: [divisions] } + type: { enum: [geographic_area] } + then: { "$ref": divisions/geographic_area.yaml } + else: { propertyNames: false } - if: properties: properties: