"""analyze_geo_extent — geographic extent of a cloud of coordinates (EDA `geospatial`). Pure function: no I/O, no network, deterministic. Given two parallel lists of latitudes and longitudes it derives the bounding box, centroid, diagonal span (haversine), per-region counts and hemisphere split of the points, and assigns each point to a country/region via an OFFLINE lookup against a table of rectangular bounding boxes embedded as a constant (`_REGION_BBOXES`). It never reads files, never hits the network and depends only on `math`. The country boxes are deliberately coarse rectangles (a KISS approximation, NOT a reverse-geocoder). Reading is defensive throughout and the function NEVER raises: invalid pairs (None / NaN / out of range) are silently discarded and an empty cloud yields a zeroed result the caller can skip. """ import math # Earth mean radius in km used by the haversine formula. _EARTH_RADIUS_KM = 6371.0 # How many distinct regions to surface in `by_region` before collapsing the # remainder into a single "Otros" bucket. _TOP_REGIONS = 8 # Offline region lookup: (name, lat_min, lat_max, lon_min, lon_max). # # Specific countries are listed FIRST and continental fallbacks LAST: each point # is assigned to the FIRST box that contains it, so the more specific country box # wins over the broad continent box. Boxes are coarse rectangles approximating # the mainland extent of each region; overlapping neighbours are ordered so the # narrower/more-western country claims its coastal points (e.g. Portugal before # Spain, Chile before Argentina, the contiguous US before Canada). _REGION_BBOXES = ( # --- countries (specific) --- ("Portugal", 36.9, 42.2, -9.6, -6.2), ("España", 36.0, 43.8, -9.4, 3.4), ("Francia", 41.3, 51.1, -5.2, 9.6), ("Reino Unido", 49.9, 58.7, -8.6, 1.8), ("Irlanda", 51.4, 55.4, -10.6, -5.9), ("Países Bajos", 50.7, 53.6, 3.3, 7.2), ("Bélgica", 49.5, 51.5, 2.5, 6.4), ("Suiza", 45.8, 47.8, 5.9, 10.5), ("Alemania", 47.3, 55.1, 5.9, 15.0), ("Italia", 36.6, 47.1, 6.6, 18.5), ("Marruecos", 27.7, 35.9, -13.2, -1.0), ("Egipto", 22.0, 31.7, 25.0, 35.0), ("Sudáfrica", -34.8, -22.1, 16.5, 32.9), ("China", 18.0, 53.6, 73.5, 135.1), ("Japón", 24.0, 45.6, 122.9, 145.9), ("India", 6.7, 35.5, 68.1, 97.4), ("Australia", -43.7, -10.0, 112.9, 153.7), ("México", 14.5, 32.7, -118.4, -86.7), ("Estados Unidos", 24.4, 49.4, -125.0, -66.9), ("Canadá", 41.7, 83.1, -141.0, -52.6), ("Chile", -55.9, -17.5, -75.6, -66.4), ("Argentina", -55.1, -21.8, -73.6, -53.6), ("Brasil", -33.8, 5.3, -74.0, -34.8), ("Rusia", 41.2, 77.0, 19.6, 180.0), # --- continental fallbacks (broad) --- ("Europa", 34.0, 72.0, -25.0, 45.0), ("África", -35.0, 37.5, -18.0, 52.0), ("Asia", 5.0, 78.0, 26.0, 180.0), ("América del Norte", 7.0, 84.0, -168.0, -52.0), ("América del Sur", -56.0, 13.0, -82.0, -34.0), ("Oceanía", -50.0, 0.0, 110.0, 180.0), ) def _coord(value, limit): """Coerce a coordinate to a valid float in [-limit, limit] or None. bool is a subclass of int but never a real coordinate, so True/False are treated as missing. NaN and out-of-range values are rejected. """ if value is None or isinstance(value, bool): return None try: f = float(value) except (TypeError, ValueError): return None # NaN is the only value that is not equal to itself. if f != f or f < -limit or f > limit: return None return f def _haversine_km(lat1, lon1, lat2, lon2): """Great-circle distance in km between two (lat, lon) points in degrees.""" rlat1, rlat2 = math.radians(lat1), math.radians(lat2) dlat = math.radians(lat2 - lat1) dlon = math.radians(lon2 - lon1) a = math.sin(dlat / 2.0) ** 2 + math.cos(rlat1) * math.cos(rlat2) * math.sin(dlon / 2.0) ** 2 return 2.0 * _EARTH_RADIUS_KM * math.asin(min(1.0, math.sqrt(a))) def _region_of(lat, lon): """Return the name of the first embedded box containing (lat, lon).""" for name, lat_min, lat_max, lon_min, lon_max in _REGION_BBOXES: if lat_min <= lat <= lat_max and lon_min <= lon <= lon_max: return name return "Océano/Otros" def _empty_result(): """Result shape when there are no valid coordinate pairs.""" return { "n_points": 0, "bbox": None, "centroid": None, "span_km": 0.0, "by_region": [], "hemisphere": {"north": 0, "south": 0, "east": 0, "west": 0}, "note": "sin coordenadas validas", } def analyze_geo_extent(lats: list, lons: list) -> dict: """Summarise the geographic extent of a cloud of lat/lon coordinates. Pairs `lats[i]` with `lons[i]` by index (over the common length when the two lists differ in size), discards any pair where either value is None / NaN or outside [-90, 90] (lat) / [-180, 180] (lon), and derives the bounding box, centroid, diagonal span, per-region counts and hemisphere split. Each valid point is matched to a country/region by an offline lookup against coarse rectangular bounding boxes (`_REGION_BBOXES`). Args: lats: List of latitudes in degrees ([-90, 90]); read defensively. lons: List of longitudes in degrees ([-180, 180]); read defensively. Paired with `lats` by index; the shorter length wins when they differ. Returns: Dict with the geographic summary: {n_points, bbox={lat_min,lat_max,lon_min,lon_max}, centroid={lat,lon}, span_km (haversine of the SW->NE bbox diagonal), by_region=[{region,count}] (descending, top-8 with the rest folded into "Otros"), hemisphere={north,south,east,west}, note (Spanish summary phrase)}. With no valid pairs returns the zeroed shape: n_points 0, bbox None, centroid None, span_km 0.0, empty by_region, zeroed hemisphere and the note "sin coordenadas validas". Never raises. """ if not isinstance(lats, (list, tuple)) or not isinstance(lons, (list, tuple)): return _empty_result() valid = [] # zip already stops at the shorter list -> unbalanced lengths are handled. for raw_lat, raw_lon in zip(lats, lons): lat = _coord(raw_lat, 90.0) lon = _coord(raw_lon, 180.0) if lat is None or lon is None: continue valid.append((lat, lon)) if not valid: return _empty_result() n = len(valid) lat_vals = [p[0] for p in valid] lon_vals = [p[1] for p in valid] lat_min, lat_max = min(lat_vals), max(lat_vals) lon_min, lon_max = min(lon_vals), max(lon_vals) centroid_lat = sum(lat_vals) / n centroid_lon = sum(lon_vals) / n # Diagonal span: SW corner (lat_min, lon_min) to NE corner (lat_max, lon_max). span_km = _haversine_km(lat_min, lon_min, lat_max, lon_max) # Hemisphere split: the equator/prime-meridian go to north/east respectively. north = sum(1 for lat in lat_vals if lat >= 0.0) south = n - north east = sum(1 for lon in lon_vals if lon >= 0.0) west = n - east # Count points per region (offline bbox lookup). counts = {} for lat, lon in valid: region = _region_of(lat, lon) counts[region] = counts.get(region, 0) + 1 # Descending by count, then by name for a deterministic tie-break. ranked = sorted(counts.items(), key=lambda kv: (-kv[1], kv[0])) by_region = [{"region": name, "count": count} for name, count in ranked[:_TOP_REGIONS]] rest = sum(count for _, count in ranked[_TOP_REGIONS:]) if rest > 0: by_region.append({"region": "Otros", "count": rest}) top_region, top_count = ranked[0] note = ( "los puntos se concentran en {region} ({count} de {n})".format( region=top_region, count=top_count, n=n ) ) return { "n_points": n, "bbox": { "lat_min": lat_min, "lat_max": lat_max, "lon_min": lon_min, "lon_max": lon_max, }, "centroid": {"lat": centroid_lat, "lon": centroid_lon}, "span_km": span_km, "by_region": by_region, "hemisphere": {"north": north, "south": south, "east": east, "west": west}, "note": note, }