package cmd import ( "encoding/base64" "fmt" "github.com/go-resty/resty/v2" "github.com/mizuki1412/go-core-kit/class/exception" "github.com/mizuki1412/go-core-kit/init/initkit" "github.com/mizuki1412/go-core-kit/service/logkit" "github.com/spf13/cobra" "github.com/tidwall/gjson" "github.com/xuri/excelize/v2" "io/ioutil" "os" "path/filepath" "regexp" "strings" "time" ) func init() { rootCmd.AddCommand(auSimCmd) } var ( apiKey = "oEr6D60u6mmJRPlAHEIx8dWN" secretKey = "r7fYVHExzsqlYO9P2kgdvs5N1WVJL8vB" imageDir = "/Users/leo/Documents/au-ocr/image/" ocrAPIURL = "http://127.0.0.1:5005/ocr" outputExcelPath = "/Users/leo/Documents/au-ocr/excel/" nameWhiteList = []string{"德宁", "鼎斌", "海波", "何晓璇", "胡鑫", "佳欣", "佳怡", "嘉乐", "建闽", "锦城", "景浩", "君豪", "凯彬", "兰青", "李想", "林艳", "刘美云", "裴雅妮", "任子健", "润宇", "隆蝶", "石明毅", "覃彩玉", "唐鑫", "唐宇豪", "童斌", "万兴凯", "王权", "吴宇峰", "武文迪", "夏晨阳", "项乐奇", "小锐", "小颖", "晓雪", "谢俊", "欣萍", "鑫杰", "徐宁", "许慧超", "雅妮", "杨传杰", "杨帅", "杨笑笑", "杨兴俊", "叶琪婷", "宇飞", "玉梅", "张钧帅", "张奕韬", "张玉", "张原硕", "章帅", "赵林冲", "郑佳欣", "朱菲玲", "子健", "邹思惠", "徐林焱", "周志乐", "林焱", "许慧超", "嘉辉", "楚俊", "军豪", "王皓", "丁磊", "吴喻飞", "秦俊杰", "俊杰", "刘继伟", "佳朴", "王浩", "游雨婷", "娅慧", "李浪", "奇乐", "张雨珈", "婧婧", "乞慧利", "郭婧婧", "王鑫", "兴凯"} schoolWhiteList = []string{"财经", "工商", "工业", "杭电", "计量", "金融", "经济", "经贸", "科技", "理工", "美院", "万向", "长征", "浙音", "万象", "特殊教育", "外国语", "浙大紫金港", "师范", "杭职", "杭师大", "树人", "成院", "城院", "师大", "开放", "中美院", "美院", "同济", "杭科"} amountWhiteList = []string{"200", "100", "未充值", "50"} ) type ExtractedInfo struct { Name string School string Phone string Amount string } var auSimCmd = &cobra.Command{ Use: "au", Short: "Batch processing operations of the autumn semester sim card", Run: func(cmd *cobra.Command, args []string) { initkit.BindFlags(cmd) files, err := ioutil.ReadDir(imageDir) if err != nil { panic(exception.New(err.Error())) } tokenBaidu := getBaiduAccessToken() results := []ExtractedInfo{} for _, file := range files { if file.IsDir() || !strings.HasSuffix(file.Name(), ".jpg") || strings.Contains(file.Name(), "thumb.jpg") || strings.Contains(file.Name(), "hd.jpg") { continue } fullPath := filepath.Join(imageDir, file.Name()) base64Str, err := imageToBase64(fullPath) if err != nil { panic(exception.New("转base64失败:" + err.Error())) } //第一次调用本地的 ocrText := callOCR(base64Str) logkit.Info("成功调用本地OCR") if err != nil { panic(exception.New("OCR请求失败:" + err.Error())) } info := extractInfoFromOCR(ocrText, "") //只要返回结果中存在空值的,就再调用另外的接口 if info.Name == "" || info.Phone == "" || info.School == "" || info.Amount == "" { ocrTextBaidu := callOCRBaidu(base64Str, tokenBaidu) logkit.Info("¥成功调用百度OCR") infoBaidu := extractInfoFromOCR(ocrTextBaidu, "") //合并 if info.Name == "" { info.Name = infoBaidu.Name } if info.Phone == "" { info.Phone = infoBaidu.Phone } if info.School == "" { info.School = infoBaidu.School } if info.Amount == "" { info.Amount = infoBaidu.Amount } } //号码里包含了充值的这些敏感数字,就把号码从combined删除了重新再提取一次 //TODO 本地ocr识别不到 baidu能识别到? 把infoBaidu变量移出去 if info.Phone != "" && (strings.Contains(info.Phone, "200") || strings.Contains(info.Phone, "100") || strings.Contains(info.Phone, "50")) { infoRedo := extractInfoFromOCR(ocrText, info.Phone) info.Amount = infoRedo.Amount } if info.Phone != "" && containsBroadband(ocrText) { newName := "broadband/" + file.Name() newPath := filepath.Join(imageDir, newName) os.Rename(fullPath, newPath) info.Name = info.Name + "+宽带" logkit.Info(info.Phone + " √√√加宽带!√√√") } oldPath := fullPath newName := "" //条件放宽到识别出号码就算成功,剩下不成功的 打?手动 if info.Phone != "" { logkit.Info("业务员:" + info.Name + " 学校:" + info.School + " 首充:" + info.Amount + " 号码:" + info.Phone + " 识别成功!") newName = "success/" + fmt.Sprintf("%s%s.jpg", info.Name, info.Phone) results = append(results, info) } else { logkit.Info("××× 业务员:" + info.Name + " 学校:" + info.School + " 首充:" + info.Amount + " 号码:" + info.Phone + " 识别失败!") newName = "fail/" + file.Name() + ".error.jpg" } newPath := filepath.Join(imageDir, newName) os.Rename(oldPath, newPath) } writeExcel(results, outputExcelPath+"/"+time.Now().Format("20060102150405")+".xlsx") }, } func containsBroadband(texts []string) bool { for _, text := range texts { if strings.Contains(text, "宽带") { return true } } return false } func imageToBase64(path string) (string, error) { data, err := os.ReadFile(path) if err != nil { return "", err } return base64.StdEncoding.EncodeToString(data), nil } func callOCR(b64 string) []string { client := resty.New() resp, err := client.R(). SetHeader("Content-Type", "application/json"). SetBody(map[string]string{ "image": b64, }). Post(ocrAPIURL) if err != nil { panic(exception.New(err.Error())) } bodyStr := string(resp.Body()) // 快速检查 resultcode 是否为 200 if resp.StatusCode() != 200 { panic(exception.New(fmt.Errorf("OCR failed: %s", gjson.Get(bodyStr, "message").String()).Error())) } // 使用 GJSON 提取所有文本字段 var texts []string textResults := gjson.Get(bodyStr, "results") if !textResults.Exists() || !textResults.IsArray() { panic(exception.New(fmt.Errorf("no OCR results found").Error())) } textResults.ForEach(func(_, value gjson.Result) bool { text := value.Get("text").String() if text != "" { texts = append(texts, text) } return true }) return texts } func extractInfoFromOCR(texts []string, delPhone string) ExtractedInfo { combined := strings.Join(texts, "") if delPhone != "" { combined = strings.Replace(combined, delPhone, "", -1) } info := ExtractedInfo{} // 提取手机号 re := regexp.MustCompile(`(?:XH[^\x00-\xff]|H势|势|XH).*?(\d{11})`) match := re.FindStringSubmatch(combined) if len(match) >= 2 { info.Phone = match[1] } // 提取业务员姓名 for _, name := range nameWhiteList { if strings.Contains(combined, name) { info.Name = name context := getContext(combined, name, 30) logkit.Info("【上下文 " + context + "】") // 从上下文中查找学校 for _, school := range schoolWhiteList { if strings.Contains(context, school) { info.School = school break } } // 从上下文中查找金额 for _, amount := range amountWhiteList { if strings.Contains(context, amount) { info.Amount = amount break } } break } } //如果到这一步业务员的名字还没找到,那么就放弃提取业务员名字上下文去找学校,改成直接全文去找学校,然后从学校获取上下文找首充 if info.Name == "" { for _, school := range schoolWhiteList { if strings.Contains(combined, school) { info.School = school context := getContext(combined, school, 30) // 从上下文中查找金额 for _, amount := range amountWhiteList { if strings.Contains(context, amount) { info.Amount = amount break } } break } } } //如果到这一步业务员名字找到了,业务员上下文没有学校,那么还是直接全文去找学校 if info.Name != "" && info.School == "" { for _, school := range schoolWhiteList { if strings.Contains(combined, school) { info.School = school //如果到这里首充还是空白的,试着在学校上下文找首充 if info.Amount == "" { context := getContext(combined, school, 30) // 从上下文中查找金额 for _, amount := range amountWhiteList { if strings.Contains(context, amount) { info.Amount = amount break } } } break } } } //如果到这一步充值金额还没找到,那么可以以关键字去寻找充值金额,适用于充值100,充值200,充值50这类关键词 if info.Amount == "" { re2 := regexp.MustCompile(`充值[::]?\s*(100|200|50)\b`) match2 := re2.FindStringSubmatch(combined) if len(match2) >= 2 { info.Amount = match2[1] } } //到这里要判断一下是不是压根没充值,还是没找到再去全文正则匹配纯100、200、50 if info.Amount == "" { if strings.Contains(combined, "未充值") { info.Amount = "未充值" } } //如果到这一步充值金额还没找到,再通过正则去找纯100、200、50 /* "100", // 匹配 "200", // 匹配 "50", // 匹配 "1003", // 不匹配 "5004", // 不匹配 "2004", // 不匹配 "充值200", // 匹配 "充值2004", // 不匹配 "abc100xyz", // 匹配 */ if info.Amount == "" { re3 := regexp.MustCompile(`100|200|50`) allMatches := re3.FindAllStringIndex(combined, -1) for _, loc := range allMatches { start, end := loc[0], loc[1] beforeOK := start == 0 || (combined[start-1] < '0' || combined[start-1] > '9') afterOK := end == len(combined) || (combined[end] < '0' || combined[end] > '9') if beforeOK && afterOK { info.Amount = combined[start:end] break } } } return info } func getContext(text, keyword string, length int) string { index := strings.Index(text, keyword) if index == -1 { return "" } start := index - length if start < 0 { start = 0 } end := index + len(keyword) + length if end > len(text) { end = len(text) } return text[start:end] } func writeExcel(data []ExtractedInfo, filename string) { f := excelize.NewFile() sheet := "Sheet1" f.SetSheetRow(sheet, "A1", &[]string{"学校", "业务员姓名", "手机号码", "充值金额"}) for i, d := range data { if d.School == "万象" { d.School = "万向" } if d.School == "外国语" { d.School = "浙外" } if d.School == "浙大紫金港" { d.School = "浙大" } if d.School == "师范" { d.School = "杭师" } if d.School == "师大" { d.School = "杭师" } if d.School == "杭师大" { d.School = "杭师" } if d.School == "成院" { d.School = "城院" } if d.School == "中美院" { d.School = "美院" } if d.Amount == "未充值" { d.Amount = "0" } if d.Amount == "" { d.Amount = "?" } if d.Name == "" { d.Name = "?" } if d.School == "" { d.School = "?" } row := []string{d.School, d.Name, d.Phone, d.Amount} cell, _ := excelize.CoordinatesToCellName(1, i+2) f.SetSheetRow(sheet, cell, &row) } f.SaveAs(filename) } func getBaiduAccessToken() string { client := resty.New() url := "https://aip.baidubce.com/oauth/2.0/token" data := map[string]string{ "grant_type": "client_credentials", "client_id": apiKey, "client_secret": secretKey, } resp, err := client.R(). SetHeader("Content-Type", "application/x-www-form-urlencoded"). SetFormData(data). Post(url) if err != nil { panic(exception.New("获取AccessToken失败:" + err.Error())) } accessToken := gjson.Get(resp.String(), "access_token").String() return accessToken } func callOCRBaidu(b64, token string) []string { url := "https://aip.baidubce.com/rest/2.0/ocr/v1/accurate_basic?access_token=" + token // 构造表单数据 payload := map[string]string{ "image": b64, "multidirectional_recognize": "true", } client := resty.New() resp, err := client.R(). SetHeader("Content-Type", "application/x-www-form-urlencoded"). SetHeader("Accept", "application/json"). SetFormData(payload). Post(url) if err != nil { panic(exception.New("请求百度OCR出错:" + err.Error())) } // 使用 gjson 解析结果 result := resp.String() var texts []string textResults := gjson.Get(result, "words_result") if !textResults.Exists() || !textResults.IsArray() { panic(exception.New(fmt.Errorf("no OCR results found").Error())) } textResults.ForEach(func(_, value gjson.Result) bool { text := value.Get("words").String() if text != "" { texts = append(texts, text) } return true }) return texts }