update: au_sim优化ocr模型,细节调整

main
Leo 6 months ago
parent 1578f3d523
commit 60f3646db7

@ -29,7 +29,7 @@ var (
nameWhiteList = []string{"德宁", "鼎斌", "海波", "何晓璇", "胡鑫", "佳欣", "佳怡", "嘉乐", "建闽", "锦城", "景浩", "君豪", "凯彬", "兰青", "李想", "林艳", "刘美云", "裴雅妮", "任子健", "润宇", "隆蝶", "石明毅", "覃彩玉", "唐鑫", "唐宇豪", "童斌", "万兴凯", "王权", "吴宇峰", "武文迪", "夏晨阳", "项乐奇", "小锐", "小颖", "晓雪", "谢俊", "欣萍", "鑫杰", "徐宁", "许慧超", "雅妮", "杨传杰", "杨帅", "杨笑笑", "杨兴俊", "叶琪婷", "宇飞", "玉梅", "张钧帅", "张奕韬", "张玉", "张原硕", "章帅", "赵林冲", "郑佳欣", "朱菲玲", "子健", "邹思惠", "徐林焱", "周志乐", "林焱", "许慧超", "嘉辉", "楚俊"} nameWhiteList = []string{"德宁", "鼎斌", "海波", "何晓璇", "胡鑫", "佳欣", "佳怡", "嘉乐", "建闽", "锦城", "景浩", "君豪", "凯彬", "兰青", "李想", "林艳", "刘美云", "裴雅妮", "任子健", "润宇", "隆蝶", "石明毅", "覃彩玉", "唐鑫", "唐宇豪", "童斌", "万兴凯", "王权", "吴宇峰", "武文迪", "夏晨阳", "项乐奇", "小锐", "小颖", "晓雪", "谢俊", "欣萍", "鑫杰", "徐宁", "许慧超", "雅妮", "杨传杰", "杨帅", "杨笑笑", "杨兴俊", "叶琪婷", "宇飞", "玉梅", "张钧帅", "张奕韬", "张玉", "张原硕", "章帅", "赵林冲", "郑佳欣", "朱菲玲", "子健", "邹思惠", "徐林焱", "周志乐", "林焱", "许慧超", "嘉辉", "楚俊"}
schoolWhiteList = []string{"财经", "工商", "工业", "杭电", "计量", "金融", "经济", "经贸", "科技", "理工", "美院", "万向", "长征", "浙音", "万象", "特殊教育", "外国语"} schoolWhiteList = []string{"财经", "工商", "工业", "杭电", "计量", "金融", "经济", "经贸", "科技", "理工", "美院", "万向", "长征", "浙音", "万象", "特殊教育", "外国语"}
amountWhiteList = []string{"200", "100", "未充值"} amountWhiteList = []string{"200", "100", "未充值", "50"}
) )
type ExtractedInfo struct { type ExtractedInfo struct {
@ -65,12 +65,24 @@ var auSimCmd = &cobra.Command{
info := extractInfoFromOCR(ocrText) info := extractInfoFromOCR(ocrText)
//号码里包含了这些敏感数字 //号码里包含了这些敏感数字
if info.Phone != "" && (strings.Contains(info.Phone, "200") || strings.Contains(info.Phone, "100")) { if info.Phone != "" && (strings.Contains(info.Phone, "200") || strings.Contains(info.Phone, "100") || strings.Contains(info.Phone, "50")) {
newName := "fail/" + file.Name() + ".phone.jpg" if info.Name != "" {
newPath := filepath.Join(imageDir, newName) combined := strings.Join(ocrText, "")
os.Rename(fullPath, newPath) context := getContext(combined, info.Name, 30)
logkit.Info("业务员:" + info.Name + " 学校:" + info.School + " 首充:" + info.Amount + " 号码:" + info.Phone + " 号码里包含充值敏感数字,可能会误识别!") if strings.Contains(context, info.Phone) {
continue newName := "fail/" + file.Name() + ".phone.jpg"
newPath := filepath.Join(imageDir, newName)
os.Rename(fullPath, newPath)
logkit.Info("业务员:" + info.Name + " 学校:" + info.School + " 首充:" + info.Amount + " 号码:" + info.Phone + " 号码里包含充值敏感数字,可能会误识别!")
continue
}
} else {
newName := "fail/" + file.Name() + ".phone.jpg"
newPath := filepath.Join(imageDir, newName)
os.Rename(fullPath, newPath)
logkit.Info("业务员:" + info.Name + " 学校:" + info.School + " 首充:" + info.Amount + " 号码:" + info.Phone + " 号码里包含充值敏感数字,可能会误识别!")
continue
}
} }
if info.Phone != "" && containsBroadband(ocrText) { if info.Phone != "" && containsBroadband(ocrText) {
newName := "broadband/" + file.Name() newName := "broadband/" + file.Name()
@ -136,15 +148,14 @@ func callOCR(b64 string) []string {
} }
// 使用 GJSON 提取所有文本字段 // 使用 GJSON 提取所有文本字段
var texts []string var texts []string
textResults := gjson.Get(bodyStr, "data.0") textResults := gjson.Get(bodyStr, "data")
if !textResults.Exists() {
if !textResults.Exists() || !textResults.IsArray() {
panic(exception.New(fmt.Errorf("no OCR results found").Error())) panic(exception.New(fmt.Errorf("no OCR results found").Error()))
} }
textResults.ForEach(func(_, line gjson.Result) bool {
if line.IsArray() && line.Array()[1].IsArray() { textResults.ForEach(func(_, value gjson.Result) bool {
text := line.Array()[1].Array()[0].String() texts = append(texts, value.String())
texts = append(texts, text)
}
return true return true
}) })
return texts return texts
@ -154,6 +165,13 @@ func extractInfoFromOCR(texts []string) ExtractedInfo {
combined := strings.Join(texts, "") combined := strings.Join(texts, "")
info := ExtractedInfo{} info := ExtractedInfo{}
// 提取手机号
re := regexp.MustCompile(`(?:XH[^\x00-\xff]|H势|势).*?(\d{11})`)
match := re.FindStringSubmatch(combined)
if len(match) == 2 {
info.Phone = match[1]
}
// 提取业务员姓名 // 提取业务员姓名
for _, name := range nameWhiteList { for _, name := range nameWhiteList {
if strings.Contains(combined, name) { if strings.Contains(combined, name) {
@ -179,12 +197,22 @@ func extractInfoFromOCR(texts []string) ExtractedInfo {
break break
} }
} }
//如果到这一步业务员的名字还没找到,那么就不再通过业务员名字的上下文去找学校,而是改成直接在全文中查找学校
// 提取手机号 if info.Name == "" {
re := regexp.MustCompile(`(?:XH[^\x00-\xff]|H势|势).*?(\d{11})`) for _, school := range schoolWhiteList {
match := re.FindStringSubmatch(combined) if strings.Contains(combined, school) {
if len(match) == 2 { info.School = school
info.Phone = match[1] break
}
}
}
//如果到这一步充值金额还没找到,那么可以用关键字去寻找充值金额,适用于“充值100”、“充值200”、“充值50”这类关键词
if info.Amount == "" {
re := regexp.MustCompile(`充值[:]?\s*(100|200|50)\b`)
match := re.FindStringSubmatch(combined)
if len(match) == 2 {
info.Amount = match[1]
}
} }
return info return info

Loading…
Cancel
Save