cu-helper/cmd/au_sim.go

package cmd

import (
	"encoding/base64"
	"fmt"
	"github.com/go-resty/resty/v2"
	"github.com/mizuki1412/go-core-kit/class/exception"
	"github.com/mizuki1412/go-core-kit/init/initkit"
	"github.com/mizuki1412/go-core-kit/service/logkit"
	"github.com/spf13/cobra"
	"github.com/tidwall/gjson"
	"github.com/xuri/excelize/v2"
	"io/ioutil"
	"os"
	"path/filepath"
	"regexp"
	"strings"
	"time"
)

// init registers auSimCmd as a sub-command of rootCmd.
func init() {
	rootCmd.AddCommand(auSimCmd)
}

// Package-level settings and whitelists used by the "au" command.
var (
	// apiKey and secretKey are sent as client_id / client_secret when
	// requesting a Baidu access token.
	// NOTE(review): these credentials are hard-coded in source; consider
	// loading them from configuration or the environment instead.
	apiKey          = "oEr6D60u6mmJRPlAHEIx8dWN"
	secretKey       = "r7fYVHExzsqlYO9P2kgdvs5N1WVJL8vB"
	// imageDir is the directory scanned for .jpg images; processed images are
	// moved into its success/, fail/ and broadband/ sub-directories.
	imageDir        = "/Users/leo/Documents/au-ocr/image/"
	// ocrAPIURL is the endpoint of the OCR service that is tried first.
	ocrAPIURL       = "http://127.0.0.1:5005/ocr"
	// outputExcelPath is the directory the timestamped .xlsx report is written to.
	outputExcelPath = "/Users/leo/Documents/au-ocr/excel/"

	// nameWhiteList lists the salesperson names searched for in the OCR text.
	// The first entry contained in the text wins, so order matters.
	// NOTE(review): "佳欣" precedes "郑佳欣" and "婧婧" precedes "郭婧婧", so the
	// longer names can never be selected; "许慧超" is listed twice.
	nameWhiteList   = []string{"德宁", "鼎斌", "海波", "何晓璇", "胡鑫", "佳欣", "佳怡", "嘉乐", "建闽", "锦城", "景浩", "君豪", "凯彬", "兰青", "李想", "林艳", "刘美云", "裴雅妮", "任子健", "润宇", "隆蝶", "石明毅", "覃彩玉", "唐鑫", "唐宇豪", "童斌", "万兴凯", "王权", "吴宇峰", "武文迪", "夏晨阳", "项乐奇", "小锐", "小颖", "晓雪", "谢俊", "欣萍", "鑫杰", "徐宁", "许慧超", "雅妮", "杨传杰", "杨帅", "杨笑笑", "杨兴俊", "叶琪婷", "宇飞", "玉梅", "张钧帅", "张奕韬", "张玉", "张原硕", "章帅", "赵林冲", "郑佳欣", "朱菲玲", "子健", "邹思惠", "徐林焱", "周志乐", "林焱", "许慧超", "嘉辉", "楚俊", "军豪", "王皓", "丁磊", "吴喻飞", "秦俊杰", "俊杰", "刘继伟", "佳朴", "王浩", "游雨婷", "娅慧", "李浪", "奇乐", "张雨珈", "婧婧", "乞慧利", "郭婧婧", "王鑫", "兴凯"}
	// schoolWhiteList lists the school keywords searched for in the OCR text;
	// the first entry contained in the text wins. Several entries are aliases
	// that writeExcel later maps to a canonical name.
	// NOTE(review): "美院" is listed twice.
	schoolWhiteList = []string{"财经", "工商", "工业", "杭电", "计量", "金融", "经济", "经贸", "科技", "理工", "美院", "万向", "长征", "浙音", "万象", "特殊教育", "外国语", "浙大紫金港", "师范", "杭职", "杭师大", "树人", "成院", "城院", "师大", "开放", "中美院", "美院", "同济", "杭科"}
	// amountWhiteList lists the accepted first top-up values, checked in this order.
	amountWhiteList = []string{"200", "100", "未充值", "50"}
)

// ExtractedInfo holds the fields extracted from the OCR text of one image.
// A field is left as the empty string when it could not be found.
type ExtractedInfo struct {
	// Name is the matched salesperson name from nameWhiteList.
	Name   string
	// School is the matched school keyword from schoolWhiteList.
	School string
	// Phone is the 11-digit number captured by the phone pattern.
	Phone  string
	// Amount is the first top-up value: "200", "100", "50" or "未充值".
	Amount string
}

// auSimCmd implements the "au" sub-command. For every eligible .jpg in
// imageDir it runs OCR, extracts salesperson / school / phone / top-up amount,
// moves the image into a result sub-directory and finally writes all
// successfully recognised rows to a timestamped Excel file.
//
// Fatal problems (unreadable directory or image, OCR request failures) are
// raised by panicking with an exception, as elsewhere in this file. Failures
// to move an individual image are logged and do not stop the batch.
var auSimCmd = &cobra.Command{
	Use:   "au",
	Short: "Batch processing operations of the autumn semester sim card",
	Run: func(cmd *cobra.Command, args []string) {
		initkit.BindFlags(cmd)
		files, err := ioutil.ReadDir(imageDir)
		if err != nil {
			panic(exception.New(err.Error()))
		}
		tokenBaidu := getBaiduAccessToken()
		var results []ExtractedInfo
		for _, file := range files {
			fileName := file.Name()
			// Only plain .jpg files are processed; thumbnails and HD copies are skipped.
			if file.IsDir() || !strings.HasSuffix(fileName, ".jpg") ||
				strings.Contains(fileName, "thumb.jpg") || strings.Contains(fileName, "hd.jpg") {
				continue
			}
			fullPath := filepath.Join(imageDir, fileName)
			base64Str, err := imageToBase64(fullPath)
			if err != nil {
				panic(exception.New("转base64失败:" + err.Error()))
			}

			// First pass: the local OCR service (callOCR panics on failure).
			ocrText := callOCR(base64Str)
			logkit.Info("成功调用本地OCR")
			info := extractInfoFromOCR(ocrText, "")

			// If any field is still empty, ask Baidu OCR and fill in only the gaps.
			if info.Name == "" || info.Phone == "" || info.School == "" || info.Amount == "" {
				ocrTextBaidu := callOCRBaidu(base64Str, tokenBaidu)
				logkit.Info("￥成功调用百度OCR")
				infoBaidu := extractInfoFromOCR(ocrTextBaidu, "")
				if info.Name == "" {
					info.Name = infoBaidu.Name
				}
				if info.Phone == "" {
					info.Phone = infoBaidu.Phone
				}
				if info.School == "" {
					info.School = infoBaidu.School
				}
				if info.Amount == "" {
					info.Amount = infoBaidu.Amount
				}
			}

			// When the phone number itself contains one of the top-up amounts as
			// digits, strip the number from the text and extract the amount again
			// so the digits of the phone are not mistaken for the amount.
			// TODO: the local OCR may miss text that Baidu recognised; consider
			// keeping the Baidu result in scope and re-extracting from it too.
			if info.Phone != "" && (strings.Contains(info.Phone, "200") || strings.Contains(info.Phone, "100") || strings.Contains(info.Phone, "50")) {
				infoRedo := extractInfoFromOCR(ocrText, info.Phone)
				info.Amount = infoRedo.Amount
			}

			// Images that mention broadband are filed under broadband/ and the
			// salesperson name is tagged accordingly.
			movedToBroadband := false
			if info.Phone != "" && containsBroadband(ocrText) {
				movedToBroadband = renameImage(fullPath, filepath.Join(imageDir, "broadband", fileName))
				info.Name = info.Name + "+宽带"
				logkit.Info(info.Phone + " √√√加宽带!√√√")
			}

			// A recognised phone number is enough to count as success; the
			// remaining fields are marked with "?" in the report for manual review.
			var target string
			if info.Phone != "" {
				logkit.Info("业务员:" + info.Name + " 学校:" + info.School + " 首充:" + info.Amount + " 号码:" + info.Phone + " 识别成功!")
				target = filepath.Join(imageDir, "success", fmt.Sprintf("%s%s.jpg", info.Name, info.Phone))
				results = append(results, info)
			} else {
				logkit.Info("××× 业务员:" + info.Name + " 学校:" + info.School + " 首充:" + info.Amount + " 号码:" + info.Phone + " 识别失败!")
				target = filepath.Join(imageDir, "fail", fileName+".error.jpg")
			}

			// An image already filed under broadband/ no longer exists at
			// fullPath, so it stays where it is.
			if !movedToBroadband {
				renameImage(fullPath, target)
			}
		}

		writeExcel(results, filepath.Join(outputExcelPath, time.Now().Format("20060102150405")+".xlsx"))
	},
}

// renameImage moves src to dst and reports whether the move succeeded. A
// failure is logged instead of raised so that one bad file does not abort the
// whole batch.
func renameImage(src, dst string) bool {
	if err := os.Rename(src, dst); err != nil {
		logkit.Info("××× 移动文件失败: " + err.Error())
		return false
	}
	return true
}

// containsBroadband reports whether any single element of texts contains the
// keyword "宽带". Elements are checked individually, so the keyword is not
// matched across element boundaries.
func containsBroadband(texts []string) bool {
	const keyword = "宽带"
	found := false
	for i := 0; i < len(texts) && !found; i++ {
		found = strings.Contains(texts[i], keyword)
	}
	return found
}

// imageToBase64 reads the file at path and returns its contents encoded with
// standard (padded) base64. The read error is returned unchanged, together
// with an empty string, when the file cannot be read.
func imageToBase64(path string) (string, error) {
	raw, readErr := os.ReadFile(path)
	if readErr != nil {
		return "", readErr
	}
	encoded := base64.StdEncoding.EncodeToString(raw)
	return encoded, nil
}

// callOCR posts a base64-encoded image as JSON to the OCR service at ocrAPIURL
// and returns the non-empty "text" values of the response's "results" array,
// in response order.
//
// It panics with an exception when the request fails, when the HTTP status is
// not 200 (using the response's "message" field as detail), or when the
// response has no "results" array.
func callOCR(b64 string) []string {
	request := resty.New().R().
		SetHeader("Content-Type", "application/json").
		SetBody(map[string]string{"image": b64})
	resp, err := request.Post(ocrAPIURL)
	if err != nil {
		panic(exception.New(err.Error()))
	}

	body := string(resp.Body())
	// Anything other than HTTP 200 is treated as a failed recognition.
	if status := resp.StatusCode(); status != 200 {
		detail := gjson.Get(body, "message").String()
		panic(exception.New(fmt.Sprintf("OCR failed: %s", detail)))
	}

	results := gjson.Get(body, "results")
	if !(results.Exists() && results.IsArray()) {
		panic(exception.New("no OCR results found"))
	}

	// Collect every non-empty text field.
	var texts []string
	for _, item := range results.Array() {
		if text := item.Get("text").String(); text != "" {
			texts = append(texts, text)
		}
	}
	return texts
}

// Patterns used by extractInfoFromOCR. They are compiled once at package
// initialisation rather than on every call.
var (
	// ocrPhoneRe captures an 11-digit run that follows one of the markers
	// "XH" + a character above U+00FF, "H势", "势" or "XH".
	ocrPhoneRe = regexp.MustCompile(`(?:XH[^\x00-\xff]|H势|势|XH).*?(\d{11})`)
	// ocrRechargeRe captures 100, 200 or 50 directly after the keyword "充值"
	// (optionally followed by a colon and whitespace).
	ocrRechargeRe = regexp.MustCompile(`充值[:：]?\s*(100|200|50)\b`)
	// ocrBareAmountRe finds candidate amounts anywhere in the text; callers
	// must still check the neighbouring characters.
	ocrBareAmountRe = regexp.MustCompile(`100|200|50`)
)

// firstContainedIn returns the first entry of candidates, in slice order, that
// occurs in text, or "" when none does.
func firstContainedIn(text string, candidates []string) string {
	for _, candidate := range candidates {
		if strings.Contains(text, candidate) {
			return candidate
		}
	}
	return ""
}

// ocrStandaloneAmount returns the first occurrence of 100, 200 or 50 in text
// that is not adjacent to another ASCII digit, or "" when there is none.
//
//	"100", "200", "50"   -> match
//	"1003", "5004"       -> no match
//	"充值200"             -> match
//	"充值2004"            -> no match
//	"abc100xyz"          -> match
func ocrStandaloneAmount(text string) string {
	isDigit := func(b byte) bool { return b >= '0' && b <= '9' }
	for _, loc := range ocrBareAmountRe.FindAllStringIndex(text, -1) {
		start, end := loc[0], loc[1]
		if start > 0 && isDigit(text[start-1]) {
			continue
		}
		if end < len(text) && isDigit(text[end]) {
			continue
		}
		return text[start:end]
	}
	return ""
}

// extractInfoFromOCR concatenates the OCR text fragments and extracts the
// phone number, salesperson name, school and first top-up amount from them.
//
// When delPhone is non-empty, every occurrence of it is removed from the text
// first, so that digits belonging to the phone number cannot be mistaken for
// an amount. Fields that cannot be found are left empty.
//
// Lookup order:
//  1. Phone: ocrPhoneRe.
//  2. Name: first nameWhiteList entry in the text; school and amount are then
//     looked up in the context window around the name.
//  3. School fallback: if no school was found that way, the whole text is
//     searched, and a still-missing amount is looked up around the school.
//  4. Amount fallbacks: "充值<amount>", then the literal "未充值", then any
//     standalone 100/200/50.
func extractInfoFromOCR(texts []string, delPhone string) ExtractedInfo {
	// Size passed to getContext for the window on each side of a keyword.
	const contextWindow = 30

	combined := strings.Join(texts, "")
	if delPhone != "" {
		combined = strings.ReplaceAll(combined, delPhone, "")
	}

	var info ExtractedInfo

	if m := ocrPhoneRe.FindStringSubmatch(combined); len(m) >= 2 {
		info.Phone = m[1]
	}

	// Salesperson name, then school and amount near the name.
	if name := firstContainedIn(combined, nameWhiteList); name != "" {
		info.Name = name
		window := getContext(combined, name, contextWindow)
		logkit.Info("【上下文 " + window + "】")
		info.School = firstContainedIn(window, schoolWhiteList)
		info.Amount = firstContainedIn(window, amountWhiteList)
	}

	// No school near the name (or no name at all): search the whole text for
	// a school, and look near the school for an amount that is still missing.
	if info.School == "" {
		if school := firstContainedIn(combined, schoolWhiteList); school != "" {
			info.School = school
			if info.Amount == "" {
				window := getContext(combined, school, contextWindow)
				info.Amount = firstContainedIn(window, amountWhiteList)
			}
		}
	}

	// Amount fallback 1: explicit "充值100" / "充值200" / "充值50" phrases.
	if info.Amount == "" {
		if m := ocrRechargeRe.FindStringSubmatch(combined); len(m) >= 2 {
			info.Amount = m[1]
		}
	}
	// Amount fallback 2: the text says no top-up was made. This is checked
	// before the bare-number scan so that stray digits do not override it.
	if info.Amount == "" && strings.Contains(combined, "未充值") {
		info.Amount = "未充值"
	}
	// Amount fallback 3: any standalone 100 / 200 / 50 in the text.
	if info.Amount == "" {
		info.Amount = ocrStandaloneAmount(combined)
	}

	return info
}

// getContext returns the part of text surrounding the first occurrence of
// keyword: the keyword itself plus up to length bytes before and after it.
// It returns "" when keyword does not occur in text.
//
// length is measured in bytes, not characters (a CJK character takes three
// bytes in UTF-8). If a window edge falls inside a multi-byte character, the
// window is shrunk inward to the nearest character boundary so the result is
// never cut through a character. A negative length is treated as 0.
func getContext(text, keyword string, length int) string {
	index := strings.Index(text, keyword)
	if index == -1 {
		return ""
	}
	// Clamp length so the arithmetic below can neither go backwards nor overflow.
	if length < 0 {
		length = 0
	}
	if length > len(text) {
		length = len(text)
	}
	keywordEnd := index + len(keyword)

	start := index - length
	if start < 0 {
		start = 0
	}
	end := keywordEnd + length
	if end > len(text) {
		end = len(text)
	}

	// UTF-8 continuation bytes have the bit pattern 10xxxxxx. Skip them at
	// both edges so the slice begins and ends on a character boundary.
	isContinuation := func(b byte) bool { return b&0xC0 == 0x80 }
	for start < index && isContinuation(text[start]) {
		start++
	}
	for end > keywordEnd && end < len(text) && isContinuation(text[end]) {
		end--
	}
	return text[start:end]
}

// writeExcel writes data to a new workbook saved at filename. Sheet1 receives
// a header row (school, salesperson, phone, amount) followed by one row per
// entry, in slice order.
//
// Before writing, school aliases are mapped to their canonical short names,
// the amount "未充值" becomes "0", and any empty school, name or amount is
// written as "?" so it can be completed by hand. The entries in data are not
// modified.
//
// It panics with an exception when a row cannot be written or the file cannot
// be saved.
func writeExcel(data []ExtractedInfo, filename string) {
	const sheet = "Sheet1"

	f := excelize.NewFile()
	header := []string{"学校", "业务员姓名", "手机号码", "充值金额"}
	if err := f.SetSheetRow(sheet, "A1", &header); err != nil {
		panic(exception.New("写入Excel表头失败:" + err.Error()))
	}

	for i, d := range data {
		// d is a copy, so the normalisation below does not touch the caller's slice.
		switch d.School {
		case "万象":
			d.School = "万向"
		case "外国语":
			d.School = "浙外"
		case "浙大紫金港":
			d.School = "浙大"
		case "师范", "师大", "杭师大":
			d.School = "杭师"
		case "成院":
			d.School = "城院"
		case "中美院":
			d.School = "美院"
		case "":
			d.School = "?"
		}
		switch d.Amount {
		case "未充值":
			d.Amount = "0"
		case "":
			d.Amount = "?"
		}
		if d.Name == "" {
			d.Name = "?"
		}

		// Row 1 holds the header, so data rows start at row 2.
		cell, err := excelize.CoordinatesToCellName(1, i+2)
		if err != nil {
			panic(exception.New("计算Excel单元格失败:" + err.Error()))
		}
		row := []string{d.School, d.Name, d.Phone, d.Amount}
		if err := f.SetSheetRow(sheet, cell, &row); err != nil {
			panic(exception.New("写入Excel行失败:" + err.Error()))
		}
	}

	if err := f.SaveAs(filename); err != nil {
		panic(exception.New("保存Excel失败:" + err.Error()))
	}
}

// getBaiduAccessToken requests an OAuth access token from Baidu using the
// client-credentials grant with apiKey and secretKey, and returns the
// "access_token" field of the response.
//
// It panics with an exception when the request fails or when the response
// contains no access token, so that a rejected credential is reported here
// rather than surfacing later as an unrelated OCR failure.
func getBaiduAccessToken() string {
	const tokenURL = "https://aip.baidubce.com/oauth/2.0/token"

	resp, err := resty.New().R().
		SetHeader("Content-Type", "application/x-www-form-urlencoded").
		SetFormData(map[string]string{
			"grant_type":    "client_credentials",
			"client_id":     apiKey,
			"client_secret": secretKey,
		}).
		Post(tokenURL)
	if err != nil {
		panic(exception.New("获取AccessToken失败:" + err.Error()))
	}

	body := resp.String()
	accessToken := gjson.Get(body, "access_token").String()
	if accessToken == "" {
		// The "error" / "error_description" fields are included when present;
		// they are simply empty if the response does not carry them.
		panic(exception.New(fmt.Sprintf("获取AccessToken失败: HTTP %d %s %s",
			resp.StatusCode(),
			gjson.Get(body, "error").String(),
			gjson.Get(body, "error_description").String())))
	}
	return accessToken
}

// callOCRBaidu sends a base64-encoded image to Baidu's accurate_basic OCR
// endpoint, authenticated with token, and returns the non-empty "words"
// values of the response's "words_result" array, in response order.
//
// It panics with an exception when the request fails or when the response has
// no "words_result" array. If the response carries an "error_code" field, the
// code and its "error_msg" are included in the exception message.
func callOCRBaidu(b64, token string) []string {
	url := "https://aip.baidubce.com/rest/2.0/ocr/v1/accurate_basic?access_token=" + token
	// Form fields of the request.
	payload := map[string]string{
		"image":                      b64,
		"multidirectional_recognize": "true",
	}
	resp, err := resty.New().R().
		SetHeader("Content-Type", "application/x-www-form-urlencoded").
		SetHeader("Accept", "application/json").
		SetFormData(payload).
		Post(url)
	if err != nil {
		panic(exception.New("请求百度OCR出错:" + err.Error()))
	}

	result := resp.String()
	textResults := gjson.Get(result, "words_result")
	if !textResults.Exists() || !textResults.IsArray() {
		// Prefer the service's own error over the generic message.
		if code := gjson.Get(result, "error_code"); code.Exists() {
			panic(exception.New(fmt.Sprintf("百度OCR返回错误: %s %s",
				code.String(), gjson.Get(result, "error_msg").String())))
		}
		panic(exception.New("no OCR results found"))
	}

	var texts []string
	textResults.ForEach(func(_, value gjson.Result) bool {
		if text := value.Get("words").String(); text != "" {
			texts = append(texts, text)
		}
		return true
	})
	return texts
}