批量识别身份证并导出excel

骑着悟空看八戒 发表于 2025-11-1 08:44:21

版本更新第五版, 版本号1.0.5
1.0.4更新内容: 模型改为随程序分发, 初次启动时无需再下载模型, 解决黑框终端无法关闭的问题
1.0.5更新内容: 优化字段匹配方法, 识别精确度有所提高
版本更新第三版, 代码已开源(见附件)
更新内容: 优化软件启动速度, 增加识别字段, 增加图片重命名(可选可配置), 增加excel内图片选项(路径/图片)
由于代码量较大, 无法在此处展开, 有对源码或进一步优化感兴趣的伙伴可以下载源码阅读修改
下面的代码我只放核心逻辑部分
由于一些行业需要手动录入大量的身份证信息, 因此编写本软件以减少人工录入的工作量。
软件说明: 本软件使用 Python 3.12.10 编写, 因此无法在 Windows 10 以下的系统上运行；GUI 改用 PySide6；识别功能依赖包：paddlepaddle==2.6.2、paddleocr、paddlenlp，调用 CPU 进行识别，不依赖显卡。如果你的电脑显卡比较好，可以将依赖改为 GPU 版后重新打包，使用 GPU 识别性能会更强。
代码提供 Windows 版本。由于 OCR 模型在本地运行，软件对电脑性能要求较高：我的 Linux 云服务器只有 2 核 2G，带不动；Windows 电脑一般不会有这么低的配置，不用担心。识别过程可在断网状态下进行。
代码如下：

# 在gui.py文件中添加以下代码
import re

from PySide6.QtCore import QThread, Signal
import traceback
from openpyxl.workbook import Workbook
from openpyxl.drawing.image import Image as XLImage
from openpyxl.styles import Alignment

import cv2
import numpy as np
from PIL import Image

'''
ocr实际开始工作的线程
需要将前边加载好的模型传递过来
'''
class OCRWorker(QThread):
    """Worker thread in which the OCR job actually runs.

    The OCR model is loaded beforehand by the caller and handed to this
    worker, so the thread itself never loads a model.
    """

    # Signals used to notify the main thread about progress and results.
    progress_updated = Signal(int, int)  # current position, total count
    finished_signal = Signal()  # processing finished
    error_occurred = Signal(str)  # error message

def __init__(self, file_paths, export_options, ocr):
    """Store the job description for a later ``run``.

    Args:
        file_paths: Image paths to process.
        export_options: Mapping of export settings; the worker reads it
            with ``.get(...)`` (e.g. "export_option", "rename_options",
            "separator").
        ocr: Already-loaded OCR engine; the worker calls its ``ocr``
            method once per image.
    """
    super().__init__()
    self.file_paths = file_paths
    self.export_options = export_options
    self.ocr = ocr
    # Cooperative stop flag, set by request_termination().
    self._should_terminate = False

def run(self):
    """Thread entry point: process every queued file.

    Any exception escaping ``process_files`` is not propagated. Its
    message and full traceback are sent through ``error_occurred`` so
    the receiver can report the failure.
    """
    try:
        # Process all files.
        self.process_files(self.file_paths)
    except Exception as e:
        error_msg = f"处理过程中发生错误: {str(e)}\n{traceback.format_exc()}"
        self.error_occurred.emit(error_msg)

def process_files(self, file_paths):
    """Run OCR over ``file_paths`` and export the results to Excel.

    One worksheet row is written per image for which
    ``extract_info_from_image`` returned data. Column A holds either the
    image path or the embedded image, depending on the "export_option"
    setting ("image_path" by default, "image_file" to embed). Files are
    renamed first when rename options are configured. The workbook is
    always saved to "身份证识别结果.xlsx" before ``finished_signal`` is
    emitted, including when the loop stops early.

    Args:
        file_paths: Image paths to process, in order.
    """
    wb = Workbook()
    ws = wb.active
    ws.append(["图片", "姓名", "性别", "民族", "出生日期", "住址", "身份证号", "有效期限"])
    row_idx = 2  # row 1 is the header

    # Result keys in worksheet order, written to columns 2..8.
    field_keys = ("姓名", "性别", "民族", "出生日期", "住址", "身份证号", "有效期限")
    centered = Alignment(horizontal='center', vertical='center')
    # The export mode does not change during a run, so read it once.
    export_option = self.export_options.get("export_option", "image_path")

    total_files = len(file_paths)
    processed_count = 0
    for i, path in enumerate(file_paths):
        # Stop between files when termination was requested; the rows
        # collected so far are still saved below.
        if self._should_terminate:
            print("收到终止请求，正在保存已处理的数据...")
            break

        # Report progress before the OCR call for this file.
        self.progress_updated.emit(i + 1, total_files)
        info = self.extract_info_from_image(path)

        # Honour QThread interruption requests as well.
        if self.isInterruptionRequested():
            break
        if not info:
            continue

        for column, key in enumerate(field_keys, start=2):
            ws.cell(row=row_idx, column=column, value=info[key])

        # Rename first so the exported path or image refers to the file's
        # final location.
        if self.should_rename_file(info):
            info["图片路径"] = self.rename_file(path, info)

        if export_option == "image_file":
            # Embed the picture itself in column A.
            try:
                img = XLImage(info["图片路径"])
                img.width = 500
                img.height = 300
                # Size the row that holds the picture. Row dimensions are
                # looked up by row index.
                ws.row_dimensions[row_idx].height = img.height
                ws.add_image(img, f"A{row_idx}")
                ws.column_dimensions['A'].width = img.width * 0.14
            except Exception as e:
                # Best effort: keep the text fields even if the picture
                # cannot be embedded.
                print(f"无法插入图片 {path}: {e}")
        else:
            # Only store the image path.
            ws.cell(row=row_idx, column=1, value=info["图片路径"])

        for col in range(1, 9):
            ws.cell(row=row_idx, column=col).alignment = centered

        row_idx += 1
        processed_count += 1

    for col in range(1, 9):
        ws.cell(row=1, column=col).alignment = centered

    output_path = "身份证识别结果.xlsx"
    wb.save(output_path)

    if self._should_terminate:
        print(f"处理已终止，已完成 {processed_count}/{total_files} 个文件，结果已保存到 {output_path}")
    else:
        print(f"处理完成，共处理 {processed_count} 个文件，结果已保存到 {output_path}")
    # Tell the main thread that the job is done.
    self.finished_signal.emit()

def extract_info_from_image(self, image_path):
    """Run OCR on one ID-card image and parse the text into fields.

    All recognised text is concatenated, the card title and whitespace
    are removed, and a newline is inserted before every field label so
    that each field occupies its own line and can be matched separately.

    Args:
        image_path: Path of the image passed to ``self.ocr.ocr``.

    Returns:
        A dict with the keys "姓名", "性别", "民族", "出生日期", "住址",
        "身份证号", "有效期限" (empty string when a field is not found)
        and "图片路径", or ``None`` when OCR fails or yields no text.
    """
    try:
        result = self.ocr.ocr(image_path, cls=True)

        # 1. Join all recognised text.
        # NOTE(review): assumes the PaddleOCR 2.x layout, where each page
        # is a list of [box, (text, score)] entries and a page with no
        # detections is None. Confirm against the installed version.
        fragments = [
            line[1][0]
            for page in (result or [])
            if page
            for line in page
            if line[1][0]
        ]
        if not fragments:
            print(f"处理 {image_path} 失败: 未识别到任何文本")
            return None
        all_text = "".join(fragments)

        # 2. Drop the card title.
        all_text = re.sub(r'中华人民共和国居民身份证', '', all_text)

        # 3. Remove every kind of whitespace.
        all_text = re.sub(r'\s+', '', all_text)

        # 4. Put each field label at the start of its own line. The labels
        #    do not overlap, so one alternation pass is equivalent to
        #    substituting them one at a time.
        all_text = re.sub(
            r'(姓名|性别|民族|出生|住址|公民身份号码|签发机关|有效期限)',
            r'\n\1',
            all_text,
        )

        print(f"处理后的文本: {all_text}")

        # ID number: 17 digits plus one check character (digit or X).
        id_number = ""
        id_match = re.search(r'[\d]{17}[\dXx]', all_text)
        if id_match:
            id_number = id_match.group().strip()
            # Remove it so its digits cannot leak into other fields.
            all_text = all_text.replace(id_match.group(), '')

        def field(label):
            # Text following ``label`` up to the end of its line.
            match = re.search(rf'{label}(.+?)(?=\n|$)', all_text)
            return match.group(1).strip() if match else ""

        gender = ""
        gender_match = re.search(r'性别(男|女)', all_text)
        if gender_match:
            gender = gender_match.group(1).strip()

        data = {
            "姓名": field('姓名'),
            "性别": gender,
            "民族": field('民族'),
            "出生日期": field('出生'),
            "住址": field('住址'),
            "身份证号": id_number,
            "有效期限": field('有效期限'),
            "图片路径": image_path
        }
        print(f"data == {data}")

        return data

    except Exception as e:
        # Per-file boundary: report the failure and let the caller move
        # on to the next image.
        print(f"处理 {image_path} 失败: {e}")
        return None

def should_rename_file(self, info):
    """Return True when at least one rename option is configured.

    Args:
        info: Extracted card data. It is accepted for call-site symmetry
            with ``rename_file`` but is not consulted here.
    """
    # Truthiness covers both an empty list and a missing or None setting.
    return bool(self.export_options.get("rename_options", []))

def rename_file(self, original_path, info):
    """Rename an image according to the configured rename options.

    The new base name is built from the extracted fields selected in
    "rename_options" ("name", "id", "nation", "sex", "address"), joined
    with the "separator" setting (default "_"). Options whose field is
    empty are skipped. The directory and file extension are kept.

    Args:
        original_path: Current path of the image file.
        info: Extracted card data keyed by the Chinese field names.

    Returns:
        The new path when the file was renamed. Returns ``original_path``
        when renaming is disabled, when no selected field has a value,
        when the target already exists, or when the rename fails.
    """
    import os  # kept local, as in the original block

    if not self.should_rename_file(info):
        return original_path

    rename_options = self.export_options.get("rename_options", [])
    separator = self.export_options.get("separator", "_")

    # Maps each rename option to the result field that supplies its text.
    option_to_field = {
        "name": "姓名",
        "id": "身份证号",
        "nation": "民族",
        "sex": "性别",
        "address": "住址",
    }
    name_parts = [
        info[key]
        for key in (option_to_field.get(option) for option in rename_options)
        if key and info.get(key)
    ]
    if not name_parts:
        return original_path

    # Keep the original directory and extension. splitext returns a
    # (root, ext) pair, so only the extension part is appended.
    directory = os.path.dirname(original_path)
    extension = os.path.splitext(original_path)[1]
    new_path = os.path.join(directory, separator.join(name_parts) + extension)

    # os.rename silently replaces an existing file on POSIX. Refuse that,
    # so two cards producing the same name cannot overwrite each other.
    if new_path != original_path and os.path.exists(new_path):
        print(f"重命名文件失败 {original_path} -> {new_path}: 目标文件已存在")
        return original_path

    try:
        os.rename(original_path, new_path)
    except OSError as e:
        print(f"重命名文件失败 {original_path} -> {new_path}: {e}")
        return original_path
    return new_path

# 图片灰度处理, 处理成扫描件, 下面还没写好不要用
# def preprocess_id_card_image(self, image_path):
# """对身份证图片进行校正、裁剪并转换为黑白扫描件"""
# try:
#       # 读取图片
#       img = cv2.imread(image_path)
#       if img is None:
#          return image_path
#
#       # 1. 转换为灰度图
#       gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
#
#       # 2. 使用中值滤波代替高斯模糊
#       denoised = cv2.medianBlur(gray, 3)
#
#       # 3. 使用自适应阈值
#       binary = cv2.adaptiveThreshold(
#          denoised, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
#          cv2.THRESH_BINARY, 15, 3
#       )
#
#       # 4. 可选：轻微平滑处理
#       smoothed = cv2.medianBlur(binary, 1)
#
#       # 5. 保存处理后的图片
#       import os
#       dir_name = os.path.dirname(image_path)
#       file_name = os.path.splitext(os.path.basename(image_path))
#       file_ext = os.path.splitext(image_path)
#       processed_path = os.path.join(dir_name, f"{file_name}_processed{file_ext}")
#
#       cv2.imwrite(processed_path, smoothed)
#
#       return processed_path
# except Exception as e:
#       print(f"身份证图片预处理失败 {image_path}: {e}")
#       return image_path
#
# def order_points(self, pts):
# """对四个点进行排序：左上、右上、右下、左下"""
# rect = np.zeros((4, 2), dtype="float32")
#
# # 计算坐标和
# s = pts.sum(axis=1)
# rect = pts# 左上角点（坐标和最小）
# rect = pts# 右下角点（坐标和最大）
#
# # 计算坐标差
# diff = np.diff(pts, axis=1)
# rect = pts# 右上角点（坐标差最小）
# rect = pts# 左下角点（坐标差最大）
#
# return rect
#
# def four_point_transform(self, image, pts):
# """四点透视变换"""
# # 获取排序后的坐标
# rect = self.order_points(pts)
# (tl, tr, br, bl) = rect
#
# # 计算新图像的宽度和高度
# width_a = np.sqrt(((br - bl) ** 2) + ((br - bl) ** 2))
# width_b = np.sqrt(((tr - tl) ** 2) + ((tr - tl) ** 2))
# max_width = max(int(width_a), int(width_b))
#
# height_a = np.sqrt(((tr - br) ** 2) + ((tr - br) ** 2))
# height_b = np.sqrt(((tl - bl) ** 2) + ((tl - bl) ** 2))
# max_height = max(int(height_a), int(height_b))
#
# # 目标点
# dst = np.array([
#       ,
#       ,
#       ,
#       ], dtype="float32")
#
# # 计算透视变换矩阵并应用
# M = cv2.getPerspectiveTransform(rect, dst)
# warped = cv2.warpPerspective(image, M, (max_width, max_height))
#
# return warped

# Cancellation is cooperative. Do not kill the thread directly, because it
# could exit before the Excel workbook has been written. Setting this flag
# lets process_files stop between files and still save the workbook.
def request_termination(self):
    """Ask the worker to stop before it starts the next file."""
    self._should_terminate = True

寒哥Gh61ac8 发表于 2025-11-1 08:45:17

更新第四版:
更新内容: 1. 该版本模型随程序一起分发, 即使是初次运行也无需再联网下载模型了, 本地化更加彻底 2. 隐藏控制台黑窗口, 更美观一些
3.但是该版本程序所在文件夹不能包含中文路径了, 否则将无法识别模型
通过网盘分享的文件：id_card_ocr.zip
链接: https://pan.baidu.com/s/1amlQldVsmuggp4HLcYeq2g?pwd=c94k 提取码: c94k
--来自百度网盘超级会员v9的分享

寒哥Gh61ac8 发表于 2025-11-1 08:46:04

身份证有效期也需要识别！

huoxianghui913 发表于 2025-11-1 08:46:56

感谢楼主分享

风之影赫 发表于 2025-11-1 08:47:49

谢谢分享，小旅馆登记需要

风之影赫 发表于 2025-11-1 08:48:12

感谢分享，试一下

风之影赫 发表于 2025-11-1 08:48:50

谢谢分享，很实用

寒哥Gh61ac8 发表于 2025-11-1 08:49:47

谢谢分享，正反都能识别吗？

huoxianghui913 发表于 2025-11-1 08:49:55

感谢分享，坐等离线版

风之影赫 发表于 2025-11-1 08:50:55

这个只有第一次下载模型需要联网, 后面再用就是离线版的了

页: [1] 2

ACGac's Archiver

批量识别身份证并导出excel