From 18df4b3afef536dc305baad1499281bacac6ede6 Mon Sep 17 00:00:00 2001
From: Leo
Date: Tue, 10 Jun 2025 17:31:35 +0800
Subject: [PATCH] =?UTF-8?q?update:=20au=5Fsim=E4=BC=98=E5=8C=96?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 cmd/au_sim.go | 121 +++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 110 insertions(+), 11 deletions(-)

diff --git a/cmd/au_sim.go b/cmd/au_sim.go
index f633c6a..377909e 100644
--- a/cmd/au_sim.go
+++ b/cmd/au_sim.go
@@ -23,6 +23,10 @@ func init() {
 }
 
 var (
+	// FIXME(security): the Baidu credentials below are committed in plain text.
+	// Rotate them and load them from configuration or the environment instead.
+	apiKey          = "oEr6D60u6mmJRPlAHEIx8dWN"
+	secretKey       = "r7fYVHExzsqlYO9P2kgdvs5N1WVJL8vB"
 	imageDir        = "/Users/leo/Documents/au-ocr/image/"
 	ocrAPIURL       = "http://127.0.0.1:5005/ocr"
 	outputExcelPath = "/Users/leo/Documents/au-ocr/excel/"
@@ -48,6 +52,7 @@ var auSimCmd = &cobra.Command{
 		if err != nil {
 			panic(exception.New(err.Error()))
 		}
+		tokenBaidu := getBaiduAccessToken()
 		results := []ExtractedInfo{}
 		for _, file := range files {
 			if file.IsDir() || !strings.HasSuffix(file.Name(), ".jpg") || strings.Contains(file.Name(), "thumb.jpg") || strings.Contains(file.Name(), "hd.jpg") {
@@ -58,12 +63,32 @@ var auSimCmd = &cobra.Command{
 			if err != nil {
 				panic(exception.New("转base64失败:" + err.Error()))
 			}
+			// First pass: call the local OCR service.
 			ocrText := callOCR(base64Str)
+			logkit.Info("成功调用本地OCR")
 			if err != nil {
 				panic(exception.New("OCR请求失败:" + err.Error()))
 			}
-
 			info := extractInfoFromOCR(ocrText)
+			// If any extracted field is empty, fall back to the Baidu OCR API.
+			if info.Name == "" || info.Phone == "" || info.School == "" || info.Amount == "" {
+				ocrTextBaidu := callOCRBaidu(base64Str, tokenBaidu)
+				logkit.Info("¥成功调用百度OCR")
+				infoBaidu := extractInfoFromOCR(ocrTextBaidu)
+				// Merge: fill only the fields the local pass left empty.
+				if info.Name == "" {
+					info.Name = infoBaidu.Name
+				}
+				if info.Phone == "" {
+					info.Phone = infoBaidu.Phone
+				}
+				if info.School == "" {
+					info.School = infoBaidu.School
+				}
+				if info.Amount == "" {
+					info.Amount = infoBaidu.Amount
+				}
+			}
 			//号码里包含了这些敏感数字
 			if info.Phone != "" && (strings.Contains(info.Phone, "200") || strings.Contains(info.Phone, "100") || strings.Contains(info.Phone, "50")) {
 				if info.Name != "" {
@@ -143,7 +168,6 @@ func callOCR(b64 string) []string {
 	}
 	bodyStr := string(resp.Body())
 	// 快速检查 resultcode 是否为 200
-
 	if resp.StatusCode() != 200 {
 		panic(exception.New(fmt.Errorf("OCR failed: %s", gjson.Get(bodyStr, "message").String()).Error()))
 	}
@@ -168,11 +192,10 @@ func callOCR(b64 string) []string {
 func extractInfoFromOCR(texts []string) ExtractedInfo {
 	combined := strings.Join(texts, "")
 	info := ExtractedInfo{}
-
 	// 提取手机号
-	re := regexp.MustCompile(`(?:XH[^\x00-\xff]|H势|势).*?(\d{11})`)
+	re := regexp.MustCompile(`(?:XH[^\x00-\xff]|H势|势|XH).*?(\d{11})`)
 	match := re.FindStringSubmatch(combined)
-	if len(match) == 2 {
+	if len(match) >= 2 {
 		info.Phone = match[1]
 	}
 
@@ -243,7 +266,7 @@ func extractInfoFromOCR(texts []string) ExtractedInfo {
 	if info.Amount == "" {
 		re2 := regexp.MustCompile(`充值[::]?\s*(100|200|50)\b`)
 		match2 := re2.FindStringSubmatch(combined)
-		if len(match2) == 2 {
+		if len(match2) >= 2 {
 			info.Amount = match2[1]
 		}
 	}
@@ -264,13 +287,19 @@ func extractInfoFromOCR(texts []string) ExtractedInfo {
 		"2004", // 不匹配
 		"充值200", // 匹配
 		"充值2004", // 不匹配
-		"abc100xyz", // 不匹配
+		"abc100xyz", // 匹配
 	*/
 	if info.Amount == "" {
-		re3 := regexp.MustCompile(`\b(100|200|50)\b`)
-		match3 := re3.FindStringSubmatch(combined)
-		if len(match3) == 2 {
-			info.Amount = match3[1]
+		re3 := regexp.MustCompile(`100|200|50`)
+		allMatches := re3.FindAllStringIndex(combined, -1)
+		for _, loc := range allMatches {
+			start, end := loc[0], loc[1]
+			beforeOK := start == 0 || (combined[start-1] < '0' || combined[start-1] > '9')
+			afterOK := end == len(combined) || (combined[end] < '0' || combined[end] > '9')
+			if beforeOK && afterOK {
+				info.Amount = combined[start:end]
+				break
+			}
 		}
 	}
 
@@ -327,3 +356,73 @@ func writeExcel(data []ExtractedInfo, filename string) {
 
 	f.SaveAs(filename)
 }
+
+// getBaiduAccessToken exchanges apiKey/secretKey for a Baidu OAuth access
+// token using the client_credentials grant and returns the token string.
+// It panics when the request fails or the response carries no access_token.
+func getBaiduAccessToken() string {
+	client := resty.New()
+	url := "https://aip.baidubce.com/oauth/2.0/token"
+	data := map[string]string{
+		"grant_type":    "client_credentials",
+		"client_id":     apiKey,
+		"client_secret": secretKey,
+	}
+	resp, err := client.R().
+		SetHeader("Content-Type", "application/x-www-form-urlencoded").
+		SetFormData(data).
+		Post(url)
+	if err != nil {
+		panic(exception.New("获取AccessToken失败:" + err.Error()))
+	}
+	body := resp.String()
+	accessToken := gjson.Get(body, "access_token").String()
+	if accessToken == "" {
+		// Without this check an empty token is returned silently and only
+		// surfaces later as an opaque failure of the OCR request.
+		panic(exception.New("获取AccessToken失败:" + gjson.Get(body, "error_description").String()))
+	}
+	return accessToken
+}
+
+// callOCRBaidu posts a base64-encoded image to Baidu's accurate_basic OCR
+// endpoint and returns the non-empty "words" entries in response order.
+// b64 is the base64 image payload; token is the OAuth access token.
+// It panics on a transport error, an API error, or a missing words_result.
+func callOCRBaidu(b64, token string) []string {
+	url := "https://aip.baidubce.com/rest/2.0/ocr/v1/accurate_basic?access_token=" + token
+	// Form fields sent to the endpoint.
+	payload := map[string]string{
+		"image":                      b64,
+		"multidirectional_recognize": "true",
+	}
+	client := resty.New()
+	resp, err := client.R().
+		SetHeader("Content-Type", "application/x-www-form-urlencoded").
+		SetHeader("Accept", "application/json").
+		SetFormData(payload).
+		Post(url)
+	if err != nil {
+		panic(exception.New("请求百度OCR出错:" + err.Error()))
+	}
+
+	result := resp.String()
+	// Baidu reports API failures (e.g. an invalid token) as a JSON body with
+	// error_code/error_msg; surface that instead of a generic message.
+	if errMsg := gjson.Get(result, "error_msg"); errMsg.Exists() {
+		panic(exception.New("请求百度OCR出错:" + errMsg.String()))
+	}
+
+	textResults := gjson.Get(result, "words_result")
+	if !textResults.Exists() || !textResults.IsArray() {
+		panic(exception.New("no OCR results found"))
+	}
+	var texts []string
+	textResults.ForEach(func(_, value gjson.Result) bool {
+		if text := value.Get("words").String(); text != "" {
+			texts = append(texts, text)
+		}
+		return true
+	})
+	return texts
+}