refactor: 重构项目结构，将geo_tools重命名为app并更新相关引用

- 将主包名从geo_tools改为app - 更新所有模块中的引用路径 - 迁移并更新测试用例 - 添加项目规则文档 - 保持原有功能不变，仅进行结构调整
2026-04-12 19:49:56 +08:00
parent fcb8e1f255
commit db51d41aef
41 changed files with 4132 additions and 808 deletions
--- a/scripts/其他工具/样点剔除统计表格.py
+++ b/scripts/其他工具/样点剔除统计表格.py
@@ -0,0 +1,404 @@
+import pandas as pd
+import numpy as np
+import os
+import geopandas as gpd
+from openpyxl import Workbook
+from openpyxl.styles import Alignment, Font, Border, Side
+from openpyxl.utils import get_column_letter
+
+# 定义指标代码与单位的对应关系
+INDICATOR_UNITS = {
+    # 基本指标
+    'PH': ('pH', '-'),
+    'ECA': ('交换性钙', 'cmol(½Ca²⁺)/kg'),
+    'EMG': ('交换性镁', 'cmol(½Mg²⁺)/kg'),
+    'TN': ('全氮', 'g/kg'),
+    'TP': ('全磷', 'g/kg'),
+    'TK': ('全钾', 'g/kg'),
+    'AS1': ('有效硫', 'mg/kg'),
+    'AB': ('有效硼', 'mg/kg'),
+    'AP': ('有效磷', 'mg/kg'),
+    'AFE': ('有效铁', 'mg/kg'),
+    'ACU': ('有效铜', 'mg/kg'),
+    'AZN': ('有效锌', 'mg/kg'),
+    'AMN': ('有效锰', 'mg/kg'),
+    'OM': ('有机质', 'g/kg'),
+    'GZCHD': ('耕层厚度', 'cm'),
+    'AK': ('速效钾', 'mg/kg'),
+    'CEC': ('阳离子交换量', 'cmol/kg'),
+    # 特殊指标 - 根据文件名对应字段
+    'FL': ('粉粒', '%'),
+    'NL': ('黏粒', '%'),
+    'SL': ('砂粒', '%'),
+    'TRRZPJZ': ('土壤容重', 'g/cm³'),
+    'TRZD': ('土壤质地', '分类'),
+    # 其他可能指标
+    'AMO': ('有效钼', 'mg/kg'),
+    'TSE': ('全硒', 'mg/kg'),
+    'YXTCHD': ('有效土层厚度', 'cm')
+}
+
+# 文件名到字段的映射
+FILENAME_TO_FIELD = {
+    '粉粒': 'FL',
+    '黏粒': 'NL',
+    '砂粒': 'SL',
+    '表层容重': 'TRRZPJZ',
+    '土壤质地十二级分类': 'TRZD',
+    '双江县YXTCHD': 'YXTCHD'
+}
+
+# 扩展字段别名映射，支持更多pH字段名
+FIELD_ALIASES = {
+    'PH': ['pH', 'PH', 'ph'],  # 支持pH的各种大小写形式
+    'ECA': ['交换性钙', 'ECA'],
+    'EMG': ['交换性镁', 'EMG'],
+    'TN': ['全氮', 'TN'],
+    'TP': ['全磷', 'TP'],
+    'TK': ['全钾', 'TK'],
+    'AS1': ['有效硫', 'AS1'],
+    'AB': ['有效硼', 'AB'],
+    'AP': ['有效磷', 'AP'],
+    'AFE': ['有效铁', 'AFE'],
+    'ACU': ['有效铜', 'ACU'],
+    'AZN': ['有效锌', 'AZN'],
+    'AMN': ['有效锰', 'AMN'],
+    'OM': ['有机质', 'OM'],
+    'GZCHD': ['耕层厚度', 'GZCHD'],
+    'AK': ['速效钾', 'AK'],
+    'CEC': ['阳离子交换量', 'CEC'],
+    'FL': ['粉粒', 'FL'],
+    'NL': ['黏粒', 'NL'],
+    'SL': ['砂粒', 'SL'],
+    'TRRZPJZ': ['土壤容重', 'TRRZPJZ'],
+    'TRZD': ['土壤质地', 'TRZD'],
+    'AMO': ['有效钼', 'AMO'],
+    'TSE': ['全硒', 'TSE'],
+    'YXTCHD': ['有效土层厚度', 'YXTCHD']
+}
+
+
+def find_shapefiles(folder_path):
+    """在文件夹中递归查找所有的Shapefile文件"""
+    shapefiles = []
+
+    for root, dirs, files in os.walk(folder_path):
+        for file in files:
+            if file.lower().endswith('.shp'):
+                shapefiles.append(os.path.join(root, file))
+
+    return shapefiles
+
+
+def read_shapefile_data(shapefile_path):
+    """读取Shapefile数据并返回属性表"""
+    try:
+        print(f"  读取Shapefile: {os.path.basename(shapefile_path)}")
+        gdf = gpd.read_file(shapefile_path, encoding='utf-8')
+
+        print(f"    要素数量: {len(gdf)}")
+        print(f"    属性字段: {list(gdf.columns)}")
+
+        return gdf
+    except Exception as e:
+        print(f"  读取Shapefile失败: {e}")
+        try:
+            gdf = gpd.read_file(shapefile_path, encoding='gbk')
+            print(f"    使用GBK编码成功读取")
+            return gdf
+        except:
+            return None
+
+
+def get_indicator_data(gdf, filename):
+    """从GeoDataFrame中获取指标数据，使用统一字段匹配逻辑"""
+    indicator_data = {}
+
+    basename = os.path.basename(filename).replace('.shp', '')
+
+    # 1. 首先尝试文件名映射
+    target_field = None
+    if basename in FILENAME_TO_FIELD:
+        target_field = FILENAME_TO_FIELD[basename]
+        if target_field in gdf.columns:
+            indicator_data[target_field] = gdf[target_field]
+            print(f"    通过文件名映射找到字段: {target_field}")
+        else:
+            # 尝试通过别名查找
+            for indicator_code in INDICATOR_UNITS.keys():
+                if target_field == indicator_code:
+                    for alias in FIELD_ALIASES.get(indicator_code, []):
+                        if alias in gdf.columns:
+                            indicator_data[indicator_code] = gdf[alias]
+                            print(f"    通过文件名映射+别名找到字段: {alias} -> {indicator_code}")
+                            break
+
+    # 2. 如果没有通过文件名找到，尝试直接匹配所有指标和别名
+    if not indicator_data:
+        for indicator_code in INDICATOR_UNITS.keys():
+            # 先尝试直接匹配指标代码
+            if indicator_code in gdf.columns:
+                indicator_data[indicator_code] = gdf[indicator_code]
+                print(f"    直接匹配字段: {indicator_code}")
+                continue
+
+            # 再尝试匹配别名
+            aliases = FIELD_ALIASES.get(indicator_code, [])
+            for alias in aliases:
+                if alias in gdf.columns:
+                    indicator_data[indicator_code] = gdf[alias]
+                    print(f"    通过别名匹配: {alias} -> {indicator_code}")
+                    break
+
+    # 3. 额外检查：如果文件名包含特定关键词，尝试匹配
+    if not indicator_data:
+        filename_lower = basename.lower()
+        for indicator_code, (chinese_name, unit) in INDICATOR_UNITS.items():
+            if indicator_code.lower() in filename_lower or chinese_name in filename_lower:
+                # 尝试匹配指标代码或中文名
+                if indicator_code in gdf.columns:
+                    indicator_data[indicator_code] = gdf[indicator_code]
+                    print(f"    通过文件名关键词匹配: {indicator_code}")
+                    break
+                elif chinese_name in gdf.columns:
+                    indicator_data[indicator_code] = gdf[chinese_name]
+                    print(f"    通过文件名关键词匹配中文名: {chinese_name} -> {indicator_code}")
+                    break
+
+    return indicator_data
+
+
+def get_combined_stats_from_folder(folder_path, folder_name="数据"):
+    """从文件夹中所有shapefile合并统计指定指标"""
+    shapefiles = find_shapefiles(folder_path)
+
+    if not shapefiles:
+        print(f"  未找到Shapefile文件")
+        return pd.DataFrame()
+
+    print(f"  找到 {len(shapefiles)} 个Shapefile文件")
+
+    all_data = {code: [] for code in INDICATOR_UNITS.keys()}
+
+    for i, shp_file in enumerate(shapefiles, 1):
+        print(f"\n  [{i}] 处理文件: {os.path.basename(shp_file)}")
+        gdf = read_shapefile_data(shp_file)
+
+        if gdf is not None:
+            indicator_data = get_indicator_data(gdf, shp_file)
+
+            for indicator_code, data_series in indicator_data.items():
+                if indicator_code in all_data:
+                    # 转换为数值类型，处理可能的非数值数据
+                    try:
+                        data_series = pd.to_numeric(data_series, errors='coerce')
+                        valid_data = data_series.dropna()
+                        if len(valid_data) > 0:
+                            all_data[indicator_code].extend(valid_data.tolist())
+                            print(f"    提取 {indicator_code}: {len(valid_data)} 个值")
+                    except Exception as e:
+                        print(f"    处理 {indicator_code} 数据时出错: {e}")
+
+    # 计算每个指标的合并统计
+    stats_list = []
+
+    for indicator_code, (chinese_name, unit) in INDICATOR_UNITS.items():
+        data_list = all_data.get(indicator_code, [])
+        if not data_list:
+            continue
+
+        data_series = pd.Series(data_list)
+        # 过滤极端值（可选，根据实际需求调整）
+        data_series = data_series[(data_series >= 0) | pd.isna(data_series)]
+
+        if len(data_series) == 0:
+            continue
+
+        # 关键修复1：计算总体标准差（ddof=0），而不是默认的样本标准差（ddof=1）
+        std_dev = data_series.std(ddof=0)
+        mean_val = data_series.mean()
+
+        # 关键修复2：优化变异系数计算
+        if abs(mean_val) < 1e-8:  # 均值接近0时
+            cv_value = 0.0
+        else:
+            # CV = (标准差 / 均值) * 100，保留2位小数
+            cv_value = round((std_dev / mean_val) * 100, 2)
+
+        stats = {
+            '指标代码': indicator_code,
+            '指标': chinese_name,
+            '单位': unit,
+            '样点数': int(len(data_series)),
+            'Min': round(float(data_series.min()), 2),
+            'Max': round(float(data_series.max()), 2),
+            'Mean': round(float(mean_val), 2),
+            'Std': round(float(std_dev), 2),  # 使用总体标准差
+            'CV': cv_value
+        }
+        stats_list.append(stats)
+        print(f"    统计 {chinese_name}({indicator_code}): {len(data_series)} 个样点")
+
+    if stats_list:
+        stats_df = pd.DataFrame(stats_list)
+        stats_df = stats_df.sort_values('指标')
+        print(f"\n  总共统计到 {len(stats_df)} 个指标")
+        return stats_df
+
+    print("  未找到任何指标数据")
+    return pd.DataFrame()
+
+
+def create_statistics_excel(before_folder, after_folder, output_path):
+    """创建融合的统计表格，在剔除后表格前加一列剔除前样点数和剔除样点数"""
+    workbook = Workbook()
+
+    # 移除默认sheet
+    if 'Sheet' in workbook.sheetnames:
+        default_sheet = workbook['Sheet']
+        workbook.remove(default_sheet)
+
+    # 定义样式
+    thin_border = Border(
+        left=Side(style='thin'),
+        right=Side(style='thin'),
+        top=Side(style='thin'),
+        bottom=Side(style='thin')
+    )
+
+    print("=" * 60)
+    print("开始分析样点数据")
+    print("=" * 60)
+
+    # 分析剔除前数据
+    before_stats = None
+    if os.path.exists(before_folder):
+        print(f"\n[1] 分析剔除前数据:")
+        print(f"文件夹路径: {before_folder}")
+        before_stats = get_combined_stats_from_folder(before_folder, "剔除前")
+        if not before_stats.empty:
+            print(f"✓ 剔除前统计完成: {len(before_stats)} 个指标")
+        else:
+            print("✗ 剔除前未找到指定指标数据")
+            before_stats = None
+    else:
+        print(f"✗ 剔除前文件夹不存在: {before_folder}")
+        before_stats = None
+
+    # 分析剔除后数据
+    if os.path.exists(after_folder):
+        print(f"\n[2] 分析剔除后数据:")
+        print(f"文件夹路径: {after_folder}")
+        after_stats = get_combined_stats_from_folder(after_folder, "剔除后")
+
+        if not after_stats.empty:
+            # 创建融合的统计工作表
+            sheet_combined = workbook.create_sheet(title="样点统计")
+
+            # 新的表头：指标, 单位, 剔除前样点数, 剔除样点数, 剔除后样点数, Min, Max, Mean, Std, CV
+            combined_headers = ['指标', '单位', '剔除前样点数', '剔除样点数', '剔除后样点数', 'Min', 'Max', 'Mean',
+                                'Std', 'CV']
+            for col, header in enumerate(combined_headers, 1):
+                cell = sheet_combined.cell(row=1, column=col, value=header)
+                cell.alignment = Alignment(horizontal='center', vertical='center')
+                cell.font = Font(bold=True)
+                cell.border = thin_border
+
+            # 写入数据
+            for row_idx, (index, after_row) in enumerate(after_stats.iterrows(), start=2):
+                # 查找对应的剔除前数据
+                before_sample_count = 0
+                before_row = None
+                if before_stats is not None:
+                    # 首先尝试通过指标代码匹配
+                    matching_rows = before_stats[before_stats['指标代码'] == after_row['指标代码']]
+                    if not matching_rows.empty:
+                        before_row = matching_rows.iloc[0]
+                    else:
+                        # 如果指标代码匹配失败，尝试通过指标名称匹配
+                        matching_rows = before_stats[before_stats['指标'] == after_row['指标']]
+                        if not matching_rows.empty:
+                            before_row = matching_rows.iloc[0]
+
+                if before_row is not None:
+                    before_sample_count = int(before_row['样点数'])
+
+                # 计算剔除样点数
+                after_sample_count = int(after_row['样点数'])
+                abnormal_count = max(0, before_sample_count - after_sample_count)
+
+                # 写入数据
+                sheet_combined.cell(row=row_idx, column=1, value=after_row['指标'])  # 指标
+                sheet_combined.cell(row=row_idx, column=2, value=after_row['单位'])  # 单位
+                sheet_combined.cell(row=row_idx, column=3, value=before_sample_count)  # 剔除前样点数
+                sheet_combined.cell(row=row_idx, column=4, value=abnormal_count)  # 剔除样点数
+                sheet_combined.cell(row=row_idx, column=5, value=after_sample_count)  # 剔除后样点数
+                sheet_combined.cell(row=row_idx, column=6, value=after_row['Min'])  # Min
+                sheet_combined.cell(row=row_idx, column=7, value=after_row['Max'])  # Max
+                sheet_combined.cell(row=row_idx, column=8, value=after_row['Mean'])  # Mean
+                sheet_combined.cell(row=row_idx, column=9, value=after_row['Std'])  # Std
+                sheet_combined.cell(row=row_idx, column=10, value=after_row['CV'])  # CV
+
+                # 设置所有单元格的样式
+                for col_idx in range(1, 11):
+                    cell = sheet_combined.cell(row=row_idx, column=col_idx)
+                    cell.alignment = Alignment(horizontal='center', vertical='center')
+                    cell.border = thin_border
+
+                # 如果剔除了样点，高亮显示剔除样点数列
+                if abnormal_count > 0:
+                    cell = sheet_combined.cell(row=row_idx, column=4)  # 剔除样点数列
+                    cell.font = Font(bold=True, color="FF0000")  # 红色加粗
+
+            # 调整列宽
+            combined_column_widths = {
+                '指标': 15,
+                '单位': 12,
+                '剔除前样点数': 12,
+                '剔除样点数': 12,
+                '剔除后样点数': 12,
+                'Min': 10,
+                'Max': 10,
+                'Mean': 10,
+                'Std': 10,
+                'CV': 10
+            }
+
+            for col_idx, col_name in enumerate(combined_headers, 1):
+                column_letter = get_column_letter(col_idx)
+                if col_name in combined_column_widths:
+                    sheet_combined.column_dimensions[column_letter].width = combined_column_widths[col_name]
+
+            print(f"\n✓ 融合统计完成: {len(after_stats)} 个指标")
+
+            # 输出匹配信息
+            if before_stats is not None:
+                print(f"  剔除前找到 {len(before_stats)} 个指标")
+                print(f"  剔除后找到 {len(after_stats)} 个指标")
+                print(
+                    f"  成功匹配 {len([i for i in range(2, len(after_stats) + 2) if sheet_combined.cell(row=i, column=3).value > 0])} 个指标的剔除前数据")
+        else:
+            print("✗ 剔除后未找到指定指标数据")
+            sheet_combined = workbook.create_sheet(title="样点统计")
+            sheet_combined.cell(row=1, column=1, value="未找到指定指标数据")
+    else:
+        print(f"✗ 剔除后文件夹不存在: {after_folder}")
+        sheet_combined = workbook.create_sheet(title="样点统计")
+        sheet_combined.cell(row=1, column=1, value="剔除后文件夹不存在")
+
+    # 保存文件
+    workbook.save(output_path)
+    print(f"\n" + "=" * 60)
+    print(f"文件保存成功: {output_path}")
+    print("=" * 60)
+
+
+# ================ 使用示例 ================
+if __name__ == "__main__":
+    # 方式1: 处理单个样点数据对
+    before_folder = r"D:\a陆平\1.5实验数据\12月新版实验室数据\云南省实验室数据成果1218\永仁县20260127"
+    after_folder = r"D:\a陆平\1.5实验数据\12月新版实验室数据\云南省实验室数据成果1218\永仁县20260127剔除后"
+    output_path = r"D:\a陆平\1.5实验数据\12月新版实验室数据\云南省实验室数据成果1218\永仁县样点统计结果.xlsx"
+
+    # 执行
+    create_statistics_excel(before_folder, after_folder, output_path)