Files
crm_project/server/app/services/ocr_service.py
T
hankin 423baff73b v0.1.0: CRM/ERP 系统内测版本 - 安全加固完成
- Docker bridge 网络隔离(8000 端口封死)
- Gunicorn 4 Worker 多进程
- Alembic 数据库迁移基线
- 日志轮转 20m×3
- JWT 密钥 + DB 密码 + CORS 收紧
- 3-2-1 备份链路(NAS + R740-B 冷备)
- 连接池 pool_pre_ping + pool_recycle=3600
2026-03-16 07:31:37 +00:00

221 lines
7.9 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
OCR 服务 — 基于 3090 节点 Qwen3.5-27B (Vision)
对发票/名片图片做 AI 视觉理解,提取结构化数据。
"""
from __future__ import annotations
import base64
import json
import re
import httpx
from app.core.config import settings
# Prompt used by ocr_image() when scene == "invoice". It instructs the vision
# model to answer with a single JSON object containing the invoice fields
# listed below, and to set any field it cannot recognize to null.
INVOICE_PROMPT = """你是一个专业的发票OCR解析器。请分析图片中的发票/票据,提取以下结构化信息,以 JSON 格式返回:
{
"merchant": "开票方/销售方名称",
"amount": 金额数字(不带货币符号),
"date": "YYYY-MM-DD 格式的开票日期",
"invoice_code": "发票代码(如有)",
"invoice_number": "发票号码(如有)",
"tax_rate": "税率(如有)",
"tax_amount": 税额数字(如有),
"items": "发票上的商品/服务名称",
"buyer": "购买方/抬头(如有)",
"remark": "备注信息(如有)"
}
只输出 JSON,不需要解释。如果某个字段无法识别,设为 null。"""
# Prompt selected when scene == "business_card" (by both ocr_image() and
# extract_invoice_from_text()). It asks the model for a single JSON object
# with the contact fields listed below, using null for unrecognized fields.
BUSINESS_CARD_PROMPT = """你是一个名片OCR解析器。请分析图片中的名片,提取以下信息并以 JSON 返回:
{
"name": "姓名",
"company": "公司名称",
"title": "职位",
"phone": "电话号码",
"email": "邮箱",
"address": "地址",
"other": "其他信息"
}
只输出 JSON。无法识别的字段设为 null。"""
async def ocr_image(
    image_base64: str,
    scene: str = "invoice",
) -> dict:
    """Run vision OCR on an image through the configured Ollama chat endpoint.

    The image is posted together with a scene-specific prompt to
    ``{settings.OLLAMA_3090_BASE_URL}/api/chat`` and the first JSON object
    found in the model's reply is returned as structured data.

    Args:
        image_base64: Base64-encoded image data; forwarded unchanged in the
            message's ``images`` list.
        scene: ``"invoice"`` or ``"business_card"`` selects the matching
            prompt constant; any other value uses a generic
            "describe all text as JSON" prompt.

    Returns:
        ``{"success": True, "data": {...}}`` with the parsed JSON object, or
        ``{"success": True, "data": {"raw_text": ...}}`` (capped at 2000
        characters) when the reply contains no parseable JSON object.
        On failure: ``{"success": False, "data": {}, "error": "..."}``.
        Errors raised while calling the model are reported through the
        ``error`` field rather than propagated.
    """
    base_url = settings.OLLAMA_3090_BASE_URL
    if not base_url:
        return {"success": False, "data": {}, "error": "OCR 服务不可用"}

    if scene == "invoice":
        prompt = INVOICE_PROMPT
    elif scene == "business_card":
        prompt = BUSINESS_CARD_PROMPT
    else:
        prompt = "请详细描述图片中的所有文字内容,以 JSON 格式输出。"

    # Strip a trailing slash so a base URL configured as "http://host/" does
    # not produce "http://host//api/chat".
    endpoint_root = str(base_url).rstrip("/")
    url = f"{endpoint_root}/api/chat"
    payload = {
        "model": settings.OLLAMA_3090_MODEL,
        "messages": [
            {
                "role": "user",
                # NOTE(review): "/no_think" is presumably the model's switch
                # for suppressing chain-of-thought output — confirm.
                "content": "/no_think\n" + prompt,
                "images": [image_base64],  # Ollama vision format
            },
        ],
        "stream": False,
        "options": {
            "temperature": 0.1,
            "num_predict": 2000,
        },
    }

    # Single source of truth for the timeout so the log message below cannot
    # drift from the value actually used by the client.
    timeout_s = 120.0
    try:
        async with httpx.AsyncClient(timeout=timeout_s) as client:
            resp = await client.post(url, json=payload)

        if resp.status_code != 200:
            print(f"[OCR] 3090 返回 {resp.status_code}: {resp.text[:200]}")
            return {"success": False, "data": {}, "error": f"VL 模型返回 {resp.status_code}"}

        data = resp.json()
        # The model's reasoning goes to message.thinking and the final answer
        # to message.content. Either may be missing or null, so normalize to
        # empty strings before use.
        message = (data.get("message") if isinstance(data, dict) else None) or {}
        content = message.get("content") or ""
        thinking = message.get("thinking") or ""

        # Prefer JSON found in content; fall back to thinking.
        decoder = json.JSONDecoder()
        for text_source in (content, thinking):
            if not text_source:
                continue
            cleaned = re.sub(r"<think>.*?</think>", "", text_source, flags=re.DOTALL).strip()
            start = cleaned.find("{")
            if start == -1:
                continue
            try:
                # raw_decode stops at the end of the first complete object, so
                # trailing commentary (even if it contains braces) no longer
                # breaks parsing the way a greedy "{...}" match did.
                result, _ = decoder.raw_decode(cleaned, start)
            except json.JSONDecodeError:
                continue
            print(f"[OCR] 解析成功: {list(result.keys())}")
            return {"success": True, "data": result}

        # No JSON object could be extracted: hand back the raw model text.
        raw = content or thinking
        print(f"[OCR] 未能提取 JSON, 内容长度: content={len(content)}, thinking={len(thinking)}")
        return {"success": True, "data": {"raw_text": raw[:2000]}}
    except httpx.TimeoutException:
        print(f"[OCR] 3090 超时 ({timeout_s:.0f}s)")
        return {"success": False, "data": {}, "error": "VL 模型响应超时"}
    except json.JSONDecodeError as e:
        # Raised by resp.json() when the HTTP body itself is not valid JSON.
        print(f"[OCR] JSON 解析失败: {e}")
        return {"success": False, "data": {}, "error": f"JSON 解析失败: {e}"}
    except Exception as e:
        # Service boundary: report any other failure to the caller as data.
        print(f"[OCR] 错误: {e}")
        return {"success": False, "data": {}, "error": str(e)}
# Prompt used by extract_invoice_from_text() when scene == "invoice". It asks
# for the same invoice JSON fields as the image prompt, but tells the model
# the input is text (Markdown / plain text converted from a PDF) whose
# formatting may be irregular.
TEXT_INVOICE_PROMPT = """你是一个专业的发票数据提取器。以下是一份发票/票据的文本内容(来自 PDF 转换后的 Markdown 或纯文本)。
请从中提取以下结构化信息,以 JSON 格式返回:
{
"merchant": "开票方/销售方名称",
"amount": 金额数字(不带货币符号),
"date": "YYYY-MM-DD 格式的开票日期",
"invoice_code": "发票代码(如有)",
"invoice_number": "发票号码(如有)",
"tax_rate": "税率(如有)",
"tax_amount": 税额数字(如有),
"items": "发票上的商品/服务名称",
"buyer": "购买方/抬头(如有)",
"remark": "备注信息(如有)"
}
只输出 JSON,不需要解释。如果某个字段无法识别,设为 null。
注意:文本可能是从 PDF 转换而来,格式可能不规整,请智能识别。"""
async def extract_invoice_from_text(
    text: str,
    scene: str = "invoice",
) -> dict:
    """Extract structured data from plain text (Markdown / TXT) with the LLM.

    No image is sent; the text is embedded directly in the chat message
    posted to ``{settings.OLLAMA_3090_BASE_URL}/api/chat``.

    Args:
        text: Source text; only the first 8000 characters are sent to keep
            the prompt size bounded.
        scene: ``"invoice"`` or ``"business_card"`` selects the matching
            prompt constant; any other value uses a generic
            "extract all key information as JSON" prompt.

    Returns:
        ``{"success": True, "data": {...}}`` with the parsed JSON object, or
        ``{"success": True, "data": {"raw_text": ...}}`` (capped at 2000
        characters) when the reply contains no parseable JSON object.
        On failure: ``{"success": False, "data": {}, "error": "..."}``.
        Errors raised while calling the model are reported through the
        ``error`` field rather than propagated.
    """
    base_url = settings.OLLAMA_3090_BASE_URL
    if not base_url:
        return {"success": False, "data": {}, "error": "AI 文本提取服务不可用"}

    if scene == "invoice":
        prompt = TEXT_INVOICE_PROMPT
    elif scene == "business_card":
        prompt = BUSINESS_CARD_PROMPT
    else:
        prompt = "请从以下文本中提取所有关键信息,以 JSON 格式输出。"

    # Cap the text length to avoid blowing up the token count; slicing is a
    # no-op for shorter inputs, so no length check is needed.
    truncated = text[:8000]

    # Strip a trailing slash so a base URL configured as "http://host/" does
    # not produce "http://host//api/chat".
    endpoint_root = str(base_url).rstrip("/")
    url = f"{endpoint_root}/api/chat"
    payload = {
        "model": settings.OLLAMA_3090_MODEL,
        "messages": [
            {
                "role": "user",
                # Text-only mode: no "images" entry is attached.
                "content": f"/no_think\n{prompt}\n\n--- 以下是发票文本内容 ---\n\n{truncated}",
            },
        ],
        "stream": False,
        "options": {
            "temperature": 0.1,
            "num_predict": 2000,
        },
    }

    try:
        async with httpx.AsyncClient(timeout=120.0) as client:
            resp = await client.post(url, json=payload)

        if resp.status_code != 200:
            print(f"[TextExtract] 3090 返回 {resp.status_code}: {resp.text[:200]}")
            return {"success": False, "data": {}, "error": f"LLM 返回 {resp.status_code}"}

        data = resp.json()
        # Either field may be missing or null in the reply; normalize to
        # empty strings so the slicing / fallback logic below is safe.
        message = (data.get("message") if isinstance(data, dict) else None) or {}
        content = message.get("content") or ""
        thinking = message.get("thinking") or ""

        # Prefer JSON found in content; fall back to thinking.
        decoder = json.JSONDecoder()
        for text_source in (content, thinking):
            if not text_source:
                continue
            cleaned = re.sub(r"<think>.*?</think>", "", text_source, flags=re.DOTALL).strip()
            start = cleaned.find("{")
            if start == -1:
                continue
            try:
                # raw_decode stops at the end of the first complete object, so
                # trailing commentary (even if it contains braces) no longer
                # breaks parsing the way a greedy "{...}" match did.
                result, _ = decoder.raw_decode(cleaned, start)
            except json.JSONDecodeError:
                continue
            print(f"[TextExtract] AI 提取成功: {list(result.keys())}")
            return {"success": True, "data": result}

        # No JSON object could be extracted: hand back the raw model text.
        raw = content or thinking
        print(f"[TextExtract] 未能提取 JSON, 内容: {raw[:200]}")
        return {"success": True, "data": {"raw_text": raw[:2000]}}
    except httpx.TimeoutException:
        print("[TextExtract] 3090 超时")
        return {"success": False, "data": {}, "error": "LLM 响应超时"}
    except json.JSONDecodeError as e:
        # Raised by resp.json() when the HTTP body itself is not valid JSON;
        # reported the same way ocr_image() reports it.
        print(f"[TextExtract] JSON 解析失败: {e}")
        return {"success": False, "data": {}, "error": f"JSON 解析失败: {e}"}
    except Exception as e:
        # Service boundary: report any other failure to the caller as data.
        print(f"[TextExtract] 错误: {e}")
        return {"success": False, "data": {}, "error": str(e)}