321 lines
9.8 KiB
Python
321 lines
9.8 KiB
Python
"""
|
||
geo_tools.io.readers
|
||
~~~~~~~~~~~~~~~~~~~~
|
||
统一的矢量数据读取接口,支持:
|
||
- Shapefile (.shp)
|
||
- GeoJSON (.geojson / .json)
|
||
- GeoPackage (.gpkg)
|
||
- File Geodatabase (.gdb) ← 通过 fiona OpenFileGDB / ESRI FileGDB 驱动
|
||
- KML / KMZ
|
||
- FlatGeobuf (.fgb)
|
||
- CSV(含 WKT 或 经纬度列)
|
||
|
||
所有函数均返回 ``geopandas.GeoDataFrame``。
|
||
"""
|
||
|
||
from __future__ import annotations
|
||
|
||
import os
|
||
from pathlib import Path
|
||
from typing import Any, Generator
|
||
|
||
import fiona
|
||
import geopandas as gpd
|
||
|
||
from app.utils.logger import get_logger
|
||
from app.utils.validators import validate_vector_path
|
||
|
||
logger = get_logger(__name__)
|
||
|
||
|
||
# ── 主入口 ─────────────────────────────────────────────────────────────────────
|
||
|
||
def read_vector(
    path: str | Path,
    layer: str | int | None = None,
    crs: str | int | None = None,
    encoding: str = "utf-8",
    rows: int | None = None,
    **kwargs: Any,
) -> gpd.GeoDataFrame:
    """Read a vector dataset, choosing the reader from the file suffix.

    Parameters
    ----------
    path:
        Dataset path. May be a file or a FileGDB directory (``*.gdb``). A
        layer name may be appended after the ``.gdb`` component
        (for example ``data.gdb/roads``).
    layer:
        Layer name or index for multi-layer formats such as GPKG or GDB.
        An explicit value takes precedence over a layer name embedded in
        ``path``; the embedded name is only used when ``layer`` is ``None``.
    crs:
        For non-CSV input the result is reprojected to this CRS with
        ``to_crs`` after reading (``None`` keeps the original CRS). For CSV
        input the value is forwarded to ``_read_csv_vector`` instead.
    encoding:
        Attribute-table encoding passed to ``geopandas.read_file``
        (Shapefiles with Chinese text often need ``"gbk"``). It is not
        forwarded on the CSV branch.
    rows:
        Maximum number of rows to read; ``None`` (default) reads everything.
        Useful for previewing large files. For CSV input it is forwarded as
        ``nrows`` unless the caller already supplied ``nrows``.
    **kwargs:
        Extra arguments passed through to ``geopandas.read_file`` (or to
        ``pandas.read_csv`` for CSV input).

    Returns
    -------
    gpd.GeoDataFrame
        The vector data that was read.

    Raises
    ------
    RuntimeError
        If ``geopandas.read_file`` fails; the original exception is kept as
        ``__cause__``.

    Examples
    --------
    Read everything::

        gdf = read_vector("data.shp")

    Read only the first 5 rows (preview mode)::

        gdf_preview = read_vector("large_data.shp", rows=5)
        print(gdf_preview.head())
    """
    path, embedded_layer = _split_gdb_layer(path)
    # Only fall back to the layer embedded in the path; never discard a layer
    # the caller passed explicitly.
    if layer is None:
        layer = embedded_layer

    path = validate_vector_path(path)
    suffix = path.suffix.lower()

    logger.info("读取矢量数据:%s(格式:%s,图层:%s)", path, suffix or "目录", layer)

    if suffix == ".csv":
        csv_kwargs: dict[str, Any] = dict(kwargs)
        if rows is not None:
            # pandas spells the row limit ``nrows``; an explicit nrows wins.
            csv_kwargs.setdefault("nrows", rows)
        return _read_csv_vector(path, crs=crs, **csv_kwargs)

    read_kwargs: dict[str, Any] = {"encoding": encoding, **kwargs}
    if layer is not None:
        read_kwargs["layer"] = layer
    if rows is not None:
        read_kwargs["rows"] = rows
        logger.info("限制读取行数:%d", rows)

    try:
        gdf = gpd.read_file(str(path), **read_kwargs)
    except Exception as exc:
        # Chain the cause so the driver-level traceback is not lost.
        raise RuntimeError(f"无法读取矢量数据:{exc}") from exc

    if crs is not None:
        logger.debug("重投影到 %s", crs)
        gdf = gdf.to_crs(crs)  # type: ignore

    logger.info("读取完成:共 %d 条要素,CRS=%s", len(gdf), gdf.crs)
    return gdf
|
||
|
||
|
||
# ── GDB 专用 ───────────────────────────────────────────────────────────────────
|
||
|
||
def read_gdb(
    gdb_path: str | Path,
    layer: str | int | None = None,
    crs: str | int | None = None,
    encoding: str = "utf-8",
    **kwargs: Any,
) -> gpd.GeoDataFrame:
    """Read one layer from an Esri File Geodatabase (``.gdb``).

    Parameters
    ----------
    gdb_path:
        Path of the ``.gdb`` directory.
    layer:
        Layer name or index. When omitted, the first layer reported by
        ``list_gdb_layers`` is read; a warning is logged if the geodatabase
        holds more than one layer.
    crs:
        Target CRS applied with ``to_crs`` after reading; ``None`` keeps the
        original CRS.
    encoding:
        Attribute-table encoding passed to ``geopandas.read_file``.
    **kwargs:
        Extra arguments passed through to ``geopandas.read_file``.

    Raises
    ------
    FileNotFoundError
        If ``gdb_path`` does not exist.
    ValueError
        If the path suffix is not ``.gdb`` (case-insensitive), or no layer
        was requested and the geodatabase has none.
    """
    gdb_dir = Path(gdb_path)
    if not gdb_dir.exists():
        raise FileNotFoundError(f"GDB 路径不存在:{gdb_dir}")

    extension = gdb_dir.suffix
    if extension.lower() != ".gdb":
        raise ValueError(f"期望 .gdb 目录,收到:{extension!r}")

    # The layer list is fetched even when a layer was given, so it can be
    # logged for debugging.
    layer_names = list_gdb_layers(gdb_dir)
    logger.debug("GDB 可用图层:%s", layer_names)

    if layer is None:
        if not layer_names:
            raise ValueError(f"GDB 中没有可用图层:{gdb_dir}")
        layer = layer_names[0]
        if len(layer_names) > 1:
            logger.warning(
                "GDB 包含多个图层 %s,默认读取第一层 %r。请显式传入 layer=... 以指定图层。",
                layer_names,
                layer,
            )

    logger.info("读取 GDB 图层:%s >> %s", gdb_dir.name, layer)
    read_options: dict[str, Any] = {"layer": layer, "encoding": encoding, **kwargs}
    gdf = gpd.read_file(str(gdb_dir), **read_options)

    if crs is not None:
        gdf = gdf.to_crs(crs)  # type: ignore

    logger.info("GDB 读取完成:%d 条要素,CRS=%s", len(gdf), gdf.crs)
    return gdf  # type: ignore
|
||
|
||
|
||
def list_gdb_layers(gdb_path: str | Path) -> list[str]:
    """Return the names of all layers in a File Geodatabase.

    Parameters
    ----------
    gdb_path:
        Path of the ``.gdb`` directory.

    Returns
    -------
    list[str]
        Layer names as reported by ``fiona.listlayers``.

    Raises
    ------
    RuntimeError
        If listing fails for any reason; the original exception is chained
        as ``__cause__`` and its text is included in the message.
    """
    gdb_dir = Path(gdb_path)
    try:
        layer_names = fiona.listlayers(str(gdb_dir))
    except Exception as exc:
        message = (
            f"无法列出 GDB 图层:{gdb_dir}。\n"
            "请确认 fiona 已安装 OpenFileGDB 驱动(通常随 conda/wheels 自带)。\n"
            f"原始错误:{exc}"
        )
        raise RuntimeError(message) from exc
    return layer_names
|
||
|
||
def _split_gdb_layer(path: str | Path) -> tuple[Path, str | None]:
    """Split a path into its File Geodatabase part and a trailing layer name.

    A ``.gdb`` component is recognised only when ``.gdb`` (case-insensitive)
    is followed by a path separator or the end of the string. Names such as
    ``my.gdbackup`` or ``x.gdb.zip`` are therefore not treated as
    geodatabases.

    Parameters
    ----------
    path:
        Full path, as a string or ``Path``. It may point past the ``.gdb``
        directory to name a layer, for example ``data/my.gdb/roads``.

    Returns
    -------
    tuple[Path, str | None]
        ``(gdb_path, layer_name)``. ``gdb_path`` ends at the ``.gdb``
        component and ``layer_name`` is whatever follows it, with leading
        separators removed (``None`` if nothing follows). When the path has
        no ``.gdb`` component the result is ``(Path(path), None)``.
    """
    import re

    path_obj = Path(path)
    str_path = str(path_obj)

    # Both "/" and "\\" count as separators so Windows-style strings keep
    # working regardless of the host platform.
    match = re.search(r"\.gdb(?=[\\/]|\Z)", str_path, flags=re.IGNORECASE)
    if match is None:
        return path_obj, None

    gdb_end = match.end()
    layer_name = str_path[gdb_end:].lstrip("/\\")
    return Path(str_path[:gdb_end]), layer_name or None
|
||
|
||
# ── GPKG 专用 ──────────────────────────────────────────────────────────────────
|
||
|
||
def read_gpkg(
    gpkg_path: str | Path,
    layer: str | int | None = None,
    crs: str | int | None = None,
    **kwargs: Any,
) -> gpd.GeoDataFrame:
    """Read one layer from a GeoPackage (``.gpkg``) file.

    Parameters
    ----------
    gpkg_path:
        Path of the ``.gpkg`` file.
    layer:
        Layer name or index. When omitted, the first layer reported by
        ``fiona.listlayers`` is read; a warning is logged if the file holds
        more than one layer.
    crs:
        Target CRS applied with ``to_crs`` after reading; ``None`` keeps the
        original CRS.
    **kwargs:
        Extra arguments passed through to ``geopandas.read_file``.

    Raises
    ------
    FileNotFoundError
        If ``gpkg_path`` does not exist.
    ValueError
        If no layer was requested and the file has none.
    """
    gpkg_file = Path(gpkg_path)
    if not gpkg_file.exists():
        raise FileNotFoundError(f"GPKG 文件不存在:{gpkg_file}")

    layer_names = fiona.listlayers(str(gpkg_file))
    if layer is None:
        if not layer_names:
            raise ValueError(f"GPKG 中没有可用图层:{gpkg_file}")
        layer = layer_names[0]
        if len(layer_names) > 1:
            logger.warning(
                "GPKG 包含多个图层 %s,默认读取第一层 %r。", layer_names, layer
            )

    gdf = gpd.read_file(str(gpkg_file), layer=layer, **kwargs)
    if crs is None:
        return gdf  # type: ignore
    return gdf.to_crs(crs)  # type: ignore
|
||
|
||
|
||
def list_gpkg_layers(gpkg_path: str | Path) -> list[str]:
    """Return the names of all layers in a GeoPackage.

    Parameters
    ----------
    gpkg_path:
        GeoPackage file path, as a string or ``Path``.

    Returns
    -------
    list[str]
        Layer names as reported by ``fiona.listlayers``. Errors raised by
        fiona propagate unchanged.
    """
    layer_names = fiona.listlayers(str(gpkg_path))
    return layer_names
|
||
|
||
|
||
# ── CSV 矢量读取 ────────────────────────────────────────────────────────────────
|
||
|
||
def _read_csv_vector(
    path: Path,
    lon_col: str = "longitude",
    lat_col: str = "latitude",
    wkt_col: str | None = None,
    crs: str | int | None = None,
    **kwargs: Any,
) -> gpd.GeoDataFrame:
    """Build a GeoDataFrame from a CSV with a WKT column or lon/lat columns.

    Parameters
    ----------
    path:
        CSV file path.
    lon_col:
        Longitude column name (unused when the WKT column is used).
    lat_col:
        Latitude column name (unused when the WKT column is used).
    wkt_col:
        Name of a WKT geometry column. It takes priority when given and
        present in the CSV; otherwise the lon/lat columns are tried.
    crs:
        CRS assigned to the result; a falsy value means ``"EPSG:4326"``.
    **kwargs:
        Extra arguments passed through to ``pandas.read_csv``.

    Raises
    ------
    ValueError
        If neither the WKT column nor both lon/lat columns are present.
    """
    import pandas as pd
    from shapely import wkt as shapely_wkt

    table = pd.read_csv(path, **kwargs)
    columns = table.columns

    use_wkt = bool(wkt_col) and wkt_col in columns
    use_lonlat = not use_wkt and lon_col in columns and lat_col in columns

    # Guard clause: bail out early when no usable geometry source exists.
    if not (use_wkt or use_lonlat):
        raise ValueError(
            f"CSV 中未找到 WKT 列 {wkt_col!r} 或经纬度列 ({lon_col!r}, {lat_col!r})。"
        )

    if use_wkt:
        geometry = table[wkt_col].apply(shapely_wkt.loads)
    else:
        from shapely.geometry import Point

        geometry = list(map(Point, table[lon_col], table[lat_col]))

    assigned_crs = crs if crs else "EPSG:4326"
    return gpd.GeoDataFrame(table, geometry=geometry, crs=assigned_crs)
|
||
|
||
|
||
def read_csv_points(
    path: str | Path,
    lon_col: str = "longitude",
    lat_col: str = "latitude",
    crs: str | int = "EPSG:4326",
    **kwargs: Any,
) -> gpd.GeoDataFrame:
    """Create a point GeoDataFrame from a CSV with lon/lat columns (public API).

    Thin wrapper that converts ``path`` to a ``Path`` and delegates to
    ``_read_csv_vector``; ``**kwargs`` are forwarded to it unchanged.
    """
    point_options: dict[str, Any] = {
        "lon_col": lon_col,
        "lat_col": lat_col,
        "crs": crs,
    }
    return _read_csv_vector(Path(path), **point_options, **kwargs)
|