Files
crm_project/server/app/services/ocr_service.py
T
hankin 815cbf9d8c v0.2.0: CRM/ERP 系统升级 - 清理 .gitignore 并移除误提交的 venv/env/db 文件
- 更新 .gitignore:全面覆盖环境变量、数据库、日志、缓存、上传文件
- 移除误跟踪的 server/venv/、crm_data.db、.env 文件
- 新增 server/.env.example 模板
- 新增合同管理、利润核算、AI教练等功能模块
- 新增 Playwright e2e 测试套件
- 前后端多项功能升级和 bug 修复
2026-05-11 07:24:19 +00:00

215 lines
7.7 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
OCR 服务 — 基于 3090 节点 Qwen3.5-27B (Vision)
对发票/名片图片做 AI 视觉理解,提取结构化数据。
"""
from __future__ import annotations
import base64
import json
import re
import httpx
from app.core.config import settings
# Prompt used by ocr_image() when scene == "invoice".
# It tells the model to act as an invoice OCR parser and to answer with one
# JSON object containing the keys listed below, using null for any field it
# cannot recognise. The text is sent to the model verbatim, so editing it
# changes model behaviour.
INVOICE_PROMPT = """你是一个专业的发票OCR解析器。请分析图片中的发票/票据,提取以下结构化信息,以 JSON 格式返回:
{
"merchant": "开票方/销售方名称",
"amount": 金额数字(不带货币符号),
"date": "YYYY-MM-DD 格式的开票日期",
"invoice_code": "发票代码(如有)",
"invoice_number": "发票号码(如有)",
"tax_rate": "税率(如有)",
"tax_amount": 税额数字(如有),
"items": "发票上的商品/服务名称",
"buyer": "购买方/抬头(如有)",
"remark": "备注信息(如有)"
}
只输出 JSON,不需要解释。如果某个字段无法识别,设为 null。"""
# Prompt used when scene == "business_card", by both ocr_image() and
# extract_invoice_from_text(). It asks the model for one JSON object with the
# contact fields listed below, using null for anything it cannot recognise.
# NOTE(review): the wording refers to a business card "in the image", yet the
# text-extraction path reuses it for plain text — confirm this is intended.
BUSINESS_CARD_PROMPT = """你是一个名片OCR解析器。请分析图片中的名片,提取以下信息并以 JSON 返回:
{
"name": "姓名",
"company": "公司名称",
"title": "职位",
"phone": "电话号码",
"email": "邮箱",
"address": "地址",
"other": "其他信息"
}
只输出 JSON。无法识别的字段设为 null。"""
async def ocr_image(
    image_base64: str,
    scene: str = "invoice",
) -> dict:
    """Extract structured data from an image via the Ollama ``/api/chat`` endpoint.

    Args:
        image_base64: Base64-encoded image data. A ``data:...;base64,`` header,
            if present, is removed before the request is built.
        scene: ``"invoice"`` or ``"business_card"`` select the matching prompt
            constant; any other value falls back to a generic "describe all
            text as JSON" prompt.

    Returns:
        ``{"success": True, "data": {...}}`` when the model produced a reply.
        ``data`` is the JSON object decoded from the reply, or
        ``{"raw_text": <first 2000 characters>}`` when no JSON object could be
        decoded from it.
        ``{"success": False, "data": {}, "error": "<message>"}`` when the image
        is empty, the service URL is not configured, the HTTP call fails or
        times out, or the model reply is empty.

    Transport and parsing failures are reported through ``error`` rather than
    raised.
    """
    # Normalise the input: tolerate None, surrounding whitespace and a data-URL
    # header (the part before the first comma is not image data).
    image_base64 = (image_base64 or "").strip()
    if image_base64.startswith("data:"):
        image_base64 = image_base64.partition(",")[2]
    if not image_base64:
        return {"success": False, "data": {}, "error": "图片数据为空"}

    if not settings.OLLAMA_3090_BASE_URL:
        return {"success": False, "data": {}, "error": "OCR 服务不可用"}

    if scene == "invoice":
        prompt = INVOICE_PROMPT
    elif scene == "business_card":
        prompt = BUSINESS_CARD_PROMPT
    else:
        prompt = "请详细描述图片中的所有文字内容,以 JSON 格式输出。"

    timeout_s = 120.0
    # Avoid "//api/chat" when the configured base URL ends with a slash.
    base_url = str(settings.OLLAMA_3090_BASE_URL).rstrip("/")
    url = f"{base_url}/api/chat"
    payload = {
        "model": settings.OLLAMA_3090_MODEL,
        "messages": [
            {
                "role": "user",
                "content": prompt,
                "images": [image_base64],  # Ollama vision message format
            },
        ],
        "stream": False,
        # Thinking mode is disabled; per the original author's note this gives
        # steadier output, avoids runaway generation and is noticeably faster.
        "think": False,
        "options": {
            "temperature": 0.1,
            "num_predict": 2000,
        },
    }

    try:
        async with httpx.AsyncClient(timeout=timeout_s) as client:
            resp = await client.post(url, json=payload)
        if resp.status_code != 200:
            detail = resp.text[:200]
            print(f"[OCR] 3090 返回 {resp.status_code}: {detail}")
            if "model runner" in detail:
                return {"success": False, "data": {}, "error": "AI OCR 模型进程崩溃,请联系管理员重启 Ollama 服务"}
            return {"success": False, "data": {}, "error": f"AI OCR 服务异常 (HTTP {resp.status_code}),请稍后重试"}
        data = resp.json()
    except httpx.TimeoutException:
        print(f"[OCR] 3090 超时({timeout_s:.0f}s)")
        return {"success": False, "data": {}, "error": f"AI OCR 响应超时({timeout_s:.0f}s),模型可能负载过高,请稍后重试"}
    except json.JSONDecodeError as e:
        # The HTTP body itself was not valid JSON.
        print(f"[OCR] JSON 解析失败: {e}")
        return {"success": False, "data": {}, "error": f"JSON 解析失败: {e}"}
    except Exception as e:
        # Boundary handler: some exceptions stringify to "", so fall back to
        # the class name to keep the error field informative.
        reason = str(e) or type(e).__name__
        print(f"[OCR] 错误: {reason}")
        return {"success": False, "data": {}, "error": reason}

    # Read message.content defensively: either level may be missing or null.
    message = data.get("message") if isinstance(data, dict) else None
    content = message.get("content") if isinstance(message, dict) else None
    if not isinstance(content, str) or not content.strip():
        print("[OCR] 模型返回内容为空")
        return {"success": False, "data": {}, "error": "AI OCR 未返回任何内容,请稍后重试"}

    # Drop any <think>...</think> section the model may still emit.
    cleaned = re.sub(r"<think>.*?</think>", "", content, flags=re.DOTALL).strip()

    # Decode the JSON object that starts at the first "{". Unlike a greedy
    # first-"{"-to-last-"}" match, this ignores whatever follows the object
    # (explanations, closing code fences, a repeated object).
    start = cleaned.find("{")
    if start != -1:
        try:
            result, _ = json.JSONDecoder().raw_decode(cleaned[start:])
        except (ValueError, RecursionError):
            result = None
        if isinstance(result, dict):
            print(f"[OCR] 解析成功: {list(result.keys())}")
            return {"success": True, "data": result}

    # Best effort: hand the raw reply back so the caller can still show it.
    print(f"[OCR] 未能提取 JSON, content 长度: {len(content)}")
    return {"success": True, "data": {"raw_text": content[:2000]}}
# Prompt used by extract_invoice_from_text() when scene == "invoice".
# It requests the same JSON keys as INVOICE_PROMPT, but tells the model that
# the input is text (Markdown or plain text converted from a PDF) whose layout
# may be irregular. The text is sent to the model verbatim.
TEXT_INVOICE_PROMPT = """你是一个专业的发票数据提取器。以下是一份发票/票据的文本内容(来自 PDF 转换后的 Markdown 或纯文本)。
请从中提取以下结构化信息,以 JSON 格式返回:
{
"merchant": "开票方/销售方名称",
"amount": 金额数字(不带货币符号),
"date": "YYYY-MM-DD 格式的开票日期",
"invoice_code": "发票代码(如有)",
"invoice_number": "发票号码(如有)",
"tax_rate": "税率(如有)",
"tax_amount": 税额数字(如有),
"items": "发票上的商品/服务名称",
"buyer": "购买方/抬头(如有)",
"remark": "备注信息(如有)"
}
只输出 JSON,不需要解释。如果某个字段无法识别,设为 null。
注意:文本可能是从 PDF 转换而来,格式可能不规整,请智能识别。"""
async def extract_invoice_from_text(
    text: str,
    scene: str = "invoice",
) -> dict:
    """Extract structured data from plain text (Markdown/TXT) with the LLM.

    No image is sent; the text is appended to the prompt and posted to the
    Ollama ``/api/chat`` endpoint.

    Args:
        text: Source text. Only the first 8000 characters are sent, to bound
            the prompt size.
        scene: ``"invoice"`` or ``"business_card"`` select the matching prompt
            constant; any other value falls back to a generic "extract all key
            information as JSON" prompt.

    Returns:
        ``{"success": True, "data": {...}}`` when the model produced a reply.
        ``data`` is the JSON object decoded from the reply, or
        ``{"raw_text": <first 2000 characters>}`` when no JSON object could be
        decoded from it.
        ``{"success": False, "data": {}, "error": "<message>"}`` when the text
        is empty, the service URL is not configured, the HTTP call fails or
        times out, or the model reply is empty.

    Transport and parsing failures are reported through ``error`` rather than
    raised.
    """
    # Nothing to extract from: do not spend a model call on empty input.
    if not text or not text.strip():
        return {"success": False, "data": {}, "error": "文本内容为空"}

    if not settings.OLLAMA_3090_BASE_URL:
        return {"success": False, "data": {}, "error": "AI 文本提取服务不可用"}

    if scene == "invoice":
        prompt = TEXT_INVOICE_PROMPT
    elif scene == "business_card":
        prompt = BUSINESS_CARD_PROMPT
    else:
        prompt = "请从以下文本中提取所有关键信息,以 JSON 格式输出。"

    # Cap the text length to keep the token count bounded (slicing is a no-op
    # for shorter input).
    truncated = text[:8000]

    # Avoid "//api/chat" when the configured base URL ends with a slash.
    base_url = str(settings.OLLAMA_3090_BASE_URL).rstrip("/")
    url = f"{base_url}/api/chat"
    payload = {
        "model": settings.OLLAMA_3090_MODEL,
        "messages": [
            {
                "role": "user",
                # NOTE(review): the separator label says "invoice text" for
                # every scene — confirm whether other scenes need their own.
                "content": f"{prompt}\n\n--- 以下是发票文本内容 ---\n\n{truncated}",
            },
        ],
        "stream": False,
        "think": False,  # thinking mode disabled
        "options": {
            "temperature": 0.1,
            "num_predict": 2000,
        },
    }

    try:
        async with httpx.AsyncClient(timeout=120.0) as client:
            resp = await client.post(url, json=payload)
        if resp.status_code != 200:
            print(f"[TextExtract] 3090 返回 {resp.status_code}: {resp.text[:200]}")
            return {"success": False, "data": {}, "error": f"LLM 返回 {resp.status_code}"}
        data = resp.json()
    except httpx.TimeoutException:
        print("[TextExtract] 3090 超时")
        return {"success": False, "data": {}, "error": "LLM 响应超时"}
    except Exception as e:
        # Boundary handler: some exceptions stringify to "", so fall back to
        # the class name to keep the error field informative.
        reason = str(e) or type(e).__name__
        print(f"[TextExtract] 错误: {reason}")
        return {"success": False, "data": {}, "error": reason}

    # Read message.content defensively: either level may be missing or null.
    message = data.get("message") if isinstance(data, dict) else None
    content = message.get("content") if isinstance(message, dict) else None
    if not isinstance(content, str) or not content.strip():
        print("[TextExtract] 模型返回内容为空")
        return {"success": False, "data": {}, "error": "LLM 未返回任何内容"}

    # Drop any <think>...</think> section the model may still emit.
    cleaned = re.sub(r"<think>.*?</think>", "", content, flags=re.DOTALL).strip()

    # Decode the JSON object that starts at the first "{". Unlike a greedy
    # first-"{"-to-last-"}" match, this ignores whatever follows the object
    # (explanations, closing code fences, a repeated object).
    start = cleaned.find("{")
    if start != -1:
        try:
            result, _ = json.JSONDecoder().raw_decode(cleaned[start:])
        except (ValueError, RecursionError):
            result = None
        if isinstance(result, dict):
            print(f"[TextExtract] AI 提取成功: {list(result.keys())}")
            return {"success": True, "data": result}

    # Best effort: hand the raw reply back so the caller can still show it.
    print(f"[TextExtract] 未能提取 JSON, content: {content[:200]}")
    return {"success": True, "data": {"raw_text": content[:2000]}}