|
|
|
|
@ -24,11 +24,11 @@ func init() {
|
|
|
|
|
|
|
|
|
|
// Package-level settings and keyword whitelists.
// NOTE(review): this span reads like a diff rendering rather than plain Go:
// ocrAPIURL, nameWhiteList and schoolWhiteList are each declared twice below,
// which a single Go var block does not allow. Presumably the first occurrence
// of each is the removed version and the second is the added one — confirm
// against the real file before relying on either value.
var (
|
|
|
|
|
// imageDir is a hard-coded absolute path under one user's home directory.
// Presumably the folder of input images — TODO confirm against the code that reads it.
imageDir = "/Users/leo/Documents/au-ocr/image/"
|
|
|
|
|
// ocrAPIURL is the endpoint that callOCR posts its JSON body to.
// NOTE(review): first of two declarations (a LAN address); presumably the old value.
ocrAPIURL = "http://172.16.5.160:8000/ocr/predict-by-base64/"
|
|
|
|
|
// NOTE(review): second declaration of ocrAPIURL (a loopback address); presumably the new value.
ocrAPIURL = "http://127.0.0.1:5005/ocr"
|
|
|
|
|
// outputExcelPath is another hard-coded absolute path under the same home directory.
// Presumably where the Excel output is written — TODO confirm against writeExcel.
outputExcelPath = "/Users/leo/Documents/au-ocr/excel/"
|
|
|
|
|
|
|
|
|
|
// nameWhiteList holds the name strings the extractor recognises (the lookup
// itself is outside this view). The entry "许慧超" appears twice in the literal.
// NOTE(review): first of two declarations; presumably the old value.
nameWhiteList = []string{"德宁", "鼎斌", "海波", "何晓璇", "胡鑫", "佳欣", "佳怡", "嘉乐", "建闽", "锦城", "景浩", "君豪", "凯彬", "兰青", "李想", "林艳", "刘美云", "裴雅妮", "任子健", "润宇", "隆蝶", "石明毅", "覃彩玉", "唐鑫", "唐宇豪", "童斌", "万兴凯", "王权", "吴宇峰", "武文迪", "夏晨阳", "项乐奇", "小锐", "小颖", "晓雪", "谢俊", "欣萍", "鑫杰", "徐宁", "许慧超", "雅妮", "杨传杰", "杨帅", "杨笑笑", "杨兴俊", "叶琪婷", "宇飞", "玉梅", "张钧帅", "张奕韬", "张玉", "张原硕", "章帅", "赵林冲", "郑佳欣", "朱菲玲", "子健", "邹思惠", "徐林焱", "周志乐", "林焱", "许慧超", "嘉辉", "楚俊"}
|
|
|
|
|
// schoolWhiteList holds the school keywords that extractInfoFromOCR looks for
// with strings.Contains. Its loops break on the first hit, so slice order is
// the match priority.
// NOTE(review): first of two declarations; presumably the old value.
schoolWhiteList = []string{"财经", "工商", "工业", "杭电", "计量", "金融", "经济", "经贸", "科技", "理工", "美院", "万向", "长征", "浙音", "万象", "特殊教育", "外国语"}
|
|
|
|
|
// NOTE(review): second declaration of nameWhiteList; identical to the first
// except for the extra trailing entry "军豪".
nameWhiteList = []string{"德宁", "鼎斌", "海波", "何晓璇", "胡鑫", "佳欣", "佳怡", "嘉乐", "建闽", "锦城", "景浩", "君豪", "凯彬", "兰青", "李想", "林艳", "刘美云", "裴雅妮", "任子健", "润宇", "隆蝶", "石明毅", "覃彩玉", "唐鑫", "唐宇豪", "童斌", "万兴凯", "王权", "吴宇峰", "武文迪", "夏晨阳", "项乐奇", "小锐", "小颖", "晓雪", "谢俊", "欣萍", "鑫杰", "徐宁", "许慧超", "雅妮", "杨传杰", "杨帅", "杨笑笑", "杨兴俊", "叶琪婷", "宇飞", "玉梅", "张钧帅", "张奕韬", "张玉", "张原硕", "章帅", "赵林冲", "郑佳欣", "朱菲玲", "子健", "邹思惠", "徐林焱", "周志乐", "林焱", "许慧超", "嘉辉", "楚俊", "军豪"}
|
|
|
|
|
// NOTE(review): second declaration of schoolWhiteList; identical to the first
// except for the extra trailing entry "浙大紫金港".
schoolWhiteList = []string{"财经", "工商", "工业", "杭电", "计量", "金融", "经济", "经贸", "科技", "理工", "美院", "万向", "长征", "浙音", "万象", "特殊教育", "外国语", "浙大紫金港"}
|
|
|
|
|
// amountWhiteList holds the amount strings that extractInfoFromOCR looks for
// with strings.Contains. Its loops break on the first hit, so "200" and "100"
// are tried before "未充值" and "50".
amountWhiteList = []string{"200", "100", "未充值", "50"}
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
@ -134,7 +134,7 @@ func callOCR(b64 string) []string {
|
|
|
|
|
resp, err := client.R().
|
|
|
|
|
SetHeader("Content-Type", "application/json").
|
|
|
|
|
SetBody(map[string]string{
|
|
|
|
|
"base64_str": b64,
|
|
|
|
|
"image": b64,
|
|
|
|
|
}).
|
|
|
|
|
Post(ocrAPIURL)
|
|
|
|
|
|
|
|
|
|
@ -143,19 +143,23 @@ func callOCR(b64 string) []string {
|
|
|
|
|
}
|
|
|
|
|
bodyStr := string(resp.Body())
|
|
|
|
|
// 快速检查 resultcode 是否为 200
|
|
|
|
|
if gjson.Get(bodyStr, "resultcode").Int() != 200 {
|
|
|
|
|
|
|
|
|
|
if resp.StatusCode() != 200 {
|
|
|
|
|
panic(exception.New(fmt.Errorf("OCR failed: %s", gjson.Get(bodyStr, "message").String()).Error()))
|
|
|
|
|
}
|
|
|
|
|
// Use GJSON to extract all text fields
|
|
|
|
|
var texts []string
|
|
|
|
|
textResults := gjson.Get(bodyStr, "data")
|
|
|
|
|
textResults := gjson.Get(bodyStr, "results")
|
|
|
|
|
|
|
|
|
|
if !textResults.Exists() || !textResults.IsArray() {
|
|
|
|
|
panic(exception.New(fmt.Errorf("no OCR results found").Error()))
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
textResults.ForEach(func(_, value gjson.Result) bool {
|
|
|
|
|
texts = append(texts, value.String())
|
|
|
|
|
text := value.Get("text").String()
|
|
|
|
|
if text != "" {
|
|
|
|
|
texts = append(texts, text)
|
|
|
|
|
}
|
|
|
|
|
return true
|
|
|
|
|
})
|
|
|
|
|
return texts
|
|
|
|
|
@ -197,21 +201,50 @@ func extractInfoFromOCR(texts []string) ExtractedInfo {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
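// If the salesperson's name still has not been found at this point, stop using the name's context to locate the school and search the full text for the school instead.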
|
|
|
|
|
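// If the salesperson's name still has not been found at this point, stop using the name's context to locate the school and search the full text for the school instead, then use the context around the school to find the first top-up amount.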
|
|
|
|
|
if info.Name == "" {
|
|
|
|
|
for _, school := range schoolWhiteList {
|
|
|
|
|
if strings.Contains(combined, school) {
|
|
|
|
|
info.School = school
|
|
|
|
|
context := getContext(combined, school, 30)
|
|
|
|
|
// Look for the amount within the context
|
|
|
|
|
for _, amount := range amountWhiteList {
|
|
|
|
|
if strings.Contains(context, amount) {
|
|
|
|
|
info.Amount = amount
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
// If the salesperson's name was found but its context contains no school, still search the full text for the school.
|
|
|
|
|
if info.Name != "" && info.School == "" {
|
|
|
|
|
for _, school := range schoolWhiteList {
|
|
|
|
|
if strings.Contains(combined, school) {
|
|
|
|
|
info.School = school
|
|
|
|
|
// If the first top-up amount is still empty here, try to find it in the context around the school.
|
|
|
|
|
if info.Amount == "" {
|
|
|
|
|
context := getContext(combined, school, 30)
|
|
|
|
|
// Look for the amount within the context
|
|
|
|
|
for _, amount := range amountWhiteList {
|
|
|
|
|
if strings.Contains(context, amount) {
|
|
|
|
|
info.Amount = amount
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// If the top-up amount still has not been found at this point, fall back to a keyword search; this covers phrases such as 充值100, 充值200 and 充值50.
|
|
|
|
|
if info.Amount == "" {
|
|
|
|
|
re := regexp.MustCompile(`充值[::]?\s*(100|200|50)\b`)
|
|
|
|
|
match := re.FindStringSubmatch(combined)
|
|
|
|
|
if len(match) == 2 {
|
|
|
|
|
info.Amount = match[1]
|
|
|
|
|
re2 := regexp.MustCompile(`充值[::]?\s*(100|200|50)\b`)
|
|
|
|
|
match2 := re2.FindStringSubmatch(combined)
|
|
|
|
|
if len(match2) == 2 {
|
|
|
|
|
info.Amount = match2[1]
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
@ -246,6 +279,9 @@ func writeExcel(data []ExtractedInfo, filename string) {
|
|
|
|
|
if d.School == "外国语" {
|
|
|
|
|
d.School = "浙外"
|
|
|
|
|
}
|
|
|
|
|
if d.School == "浙大紫金港" {
|
|
|
|
|
d.School = "浙大"
|
|
|
|
|
}
|
|
|
|
|
if d.Amount == "未充值" {
|
|
|
|
|
d.Amount = "0"
|
|
|
|
|
}
|
|
|
|
|
|