You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

414 lines
12 KiB
Go

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

package cmd
import (
"encoding/base64"
"fmt"
"github.com/go-resty/resty/v2"
"github.com/mizuki1412/go-core-kit/class/exception"
"github.com/mizuki1412/go-core-kit/init/initkit"
"github.com/mizuki1412/go-core-kit/service/logkit"
"github.com/spf13/cobra"
"github.com/tidwall/gjson"
"github.com/xuri/excelize/v2"
"io/ioutil"
"os"
"path/filepath"
"regexp"
"strings"
"time"
)
// init registers auSimCmd (the "au" sub-command) on rootCmd when the package
// is initialised.
func init() {
rootCmd.AddCommand(auSimCmd)
}
// Package-level settings and whitelists used by the "au" command.
var (
// apiKey and secretKey are sent as client_id / client_secret when requesting
// the Baidu access token.
// NOTE(review): these are credentials hard-coded in source; consider loading
// them from configuration or the environment and rotating the committed values.
apiKey = "oEr6D60u6mmJRPlAHEIx8dWN"
secretKey = "r7fYVHExzsqlYO9P2kgdvs5N1WVJL8vB"
// imageDir is the directory scanned for .jpg images; processed images are
// renamed to paths below it.
imageDir = "/Users/leo/Documents/au-ocr/image/"
// ocrAPIURL is the endpoint the local OCR request is POSTed to.
ocrAPIURL = "http://127.0.0.1:5005/ocr"
// outputExcelPath is the directory prefix for the timestamped .xlsx result.
outputExcelPath = "/Users/leo/Documents/au-ocr/excel/"
// nameWhiteList holds the salesperson names searched for in the OCR text.
// Matching is by substring, in list order, stopping at the first hit, so an
// entry that contains an earlier entry (e.g. "郑佳欣" after "佳欣", "郭婧婧"
// after "婧婧") is never selected itself. "许慧超" is listed twice.
nameWhiteList = []string{"德宁", "鼎斌", "海波", "何晓璇", "胡鑫", "佳欣", "佳怡", "嘉乐", "建闽", "锦城", "景浩", "君豪", "凯彬", "兰青", "李想", "林艳", "刘美云", "裴雅妮", "任子健", "润宇", "隆蝶", "石明毅", "覃彩玉", "唐鑫", "唐宇豪", "童斌", "万兴凯", "王权", "吴宇峰", "武文迪", "夏晨阳", "项乐奇", "小锐", "小颖", "晓雪", "谢俊", "欣萍", "鑫杰", "徐宁", "许慧超", "雅妮", "杨传杰", "杨帅", "杨笑笑", "杨兴俊", "叶琪婷", "宇飞", "玉梅", "张钧帅", "张奕韬", "张玉", "张原硕", "章帅", "赵林冲", "郑佳欣", "朱菲玲", "子健", "邹思惠", "徐林焱", "周志乐", "林焱", "许慧超", "嘉辉", "楚俊", "军豪", "王皓", "丁磊", "吴喻飞", "秦俊杰", "俊杰", "刘继伟", "佳朴", "王浩", "游雨婷", "娅慧", "李浪", "奇乐", "张雨珈", "婧婧", "乞慧利", "郭婧婧", "王鑫", "兴凯"}
// schoolWhiteList holds the school keywords searched for in the OCR text,
// with the same first-match-in-list-order rule ("中美院" therefore never
// wins over the earlier "美院", which is also listed twice). Several entries
// are aliases that writeExcel normalises to one canonical school name.
schoolWhiteList = []string{"财经", "工商", "工业", "杭电", "计量", "金融", "经济", "经贸", "科技", "理工", "美院", "万向", "长征", "浙音", "万象", "特殊教育", "外国语", "浙大紫金港", "师范", "杭职", "杭师大", "树人", "成院", "城院", "师大", "开放", "中美院", "美院", "同济", "杭科"}
// amountWhiteList holds the first top-up values looked for near a name or
// school, checked in this order; "未充值" means "not topped up".
amountWhiteList = []string{"200", "100", "未充值", "50"}
)
// ExtractedInfo holds the fields extracted from the OCR text of one image.
// A field stays the empty string when nothing was recognised for it.
type ExtractedInfo struct {
// Name is the matched nameWhiteList entry (the command appends "+宽带" to it
// when the OCR text mentions broadband).
Name string
// School is the matched schoolWhiteList entry, before alias normalisation.
School string
// Phone is the 11-digit number captured by the phone pattern.
Phone string
// Amount is the first top-up value: "200", "100", "50" or "未充值".
Amount string
}
// auSimCmd is the "au" sub-command. It batch-processes the images found in
// imageDir: each image is sent to OCR, the salesperson, school, phone number
// and first top-up amount are extracted from the text, the image is renamed
// into a result sub-directory of imageDir, and every row with a recognised
// phone number is written to a timestamped Excel file under outputExcelPath.
//
// Result locations (relative to imageDir):
//   - broadband/<original name>  phone recognised and the text mentions 宽带
//   - success/<name><phone>.jpg  phone recognised
//   - fail/<original name>.error.jpg  no phone recognised
//
// Any OCR, token or base64 failure panics, like the helpers it calls; a failed
// file move is only logged so one bad file does not abort the batch.
var auSimCmd = &cobra.Command{
	Use:   "au",
	Short: "Batch processing operations of the autumn semester sim card",
	Run: func(cmd *cobra.Command, args []string) {
		initkit.BindFlags(cmd)
		files, err := ioutil.ReadDir(imageDir)
		if err != nil {
			panic(exception.New(err.Error()))
		}

		// moveImage renames src to imageDir/rel and reports whether the move
		// succeeded. Failures are logged instead of being silently dropped.
		moveImage := func(src, rel string) bool {
			if err := os.Rename(src, filepath.Join(imageDir, rel)); err != nil {
				logkit.Info("××× 移动文件失败:" + err.Error())
				return false
			}
			return true
		}

		tokenBaidu := getBaiduAccessToken()
		var results []ExtractedInfo
		for _, file := range files {
			fileName := file.Name()
			// Only plain .jpg images are processed; thumbnails and HD copies are skipped.
			if file.IsDir() || !strings.HasSuffix(fileName, ".jpg") || strings.Contains(fileName, "thumb.jpg") || strings.Contains(fileName, "hd.jpg") {
				continue
			}
			fullPath := filepath.Join(imageDir, fileName)
			base64Str, err := imageToBase64(fullPath)
			if err != nil {
				panic(exception.New("转base64失败:" + err.Error()))
			}

			// First pass: the local OCR service. callOCR panics on failure, so
			// there is no error value to check here.
			ocrText := callOCR(base64Str)
			logkit.Info("成功调用本地OCR")
			info := extractInfoFromOCR(ocrText, "")

			// If any field is still empty, ask Baidu OCR and use its result to
			// fill only the missing fields.
			if info.Name == "" || info.Phone == "" || info.School == "" || info.Amount == "" {
				ocrTextBaidu := callOCRBaidu(base64Str, tokenBaidu)
				logkit.Info("¥成功调用百度OCR")
				infoBaidu := extractInfoFromOCR(ocrTextBaidu, "")
				if info.Name == "" {
					info.Name = infoBaidu.Name
				}
				if info.Phone == "" {
					info.Phone = infoBaidu.Phone
				}
				if info.School == "" {
					info.School = infoBaidu.School
				}
				if info.Amount == "" {
					info.Amount = infoBaidu.Amount
				}
			}

			// When the phone number itself contains one of the top-up figures,
			// remove the number from the text and extract again so the amount
			// is not read out of the phone number.
			// TODO: the redo only uses the local OCR text; if only Baidu
			// recognised the image this misses it. Hoist infoBaidu out of the
			// branch above and fall back to it.
			if info.Phone != "" && (strings.Contains(info.Phone, "200") || strings.Contains(info.Phone, "100") || strings.Contains(info.Phone, "50")) {
				infoRedo := extractInfoFromOCR(ocrText, info.Phone)
				info.Amount = infoRedo.Amount
			}

			// Broadband orders are set aside in broadband/ and tagged in the name.
			moved := false
			if info.Phone != "" && containsBroadband(ocrText) {
				moved = moveImage(fullPath, "broadband/"+fileName)
				info.Name = info.Name + "+宽带"
				logkit.Info(info.Phone + " √√√加宽带!√√√")
			}

			// A recognised phone number alone counts as success; fields that
			// are still empty are written as "?" by writeExcel for manual
			// completion.
			var target string
			if info.Phone != "" {
				logkit.Info("业务员:" + info.Name + " 学校:" + info.School + " 首充:" + info.Amount + " 号码:" + info.Phone + " 识别成功!")
				target = "success/" + fmt.Sprintf("%s%s.jpg", info.Name, info.Phone)
				results = append(results, info)
			} else {
				logkit.Info("××× 业务员:" + info.Name + " 学校:" + info.School + " 首充:" + info.Amount + " 号码:" + info.Phone + " 识别失败!")
				target = "fail/" + fileName + ".error.jpg"
			}
			// An image already moved to broadband/ no longer exists at
			// fullPath, so a second rename could only fail; skip it.
			if !moved {
				moveImage(fullPath, target)
			}
		}
		writeExcel(results, filepath.Join(outputExcelPath, time.Now().Format("20060102150405")+".xlsx"))
	},
}
// containsBroadband reports whether any element of texts contains the
// keyword "宽带" (broadband). It returns false for an empty or nil slice.
func containsBroadband(texts []string) bool {
	const keyword = "宽带"
	found := false
	for i := 0; i < len(texts) && !found; i++ {
		found = strings.Contains(texts[i], keyword)
	}
	return found
}
// imageToBase64 reads the whole file at path and returns its contents in
// standard, padded base64. If the file cannot be read it returns "" together
// with the error from os.ReadFile.
func imageToBase64(path string) (string, error) {
	raw, readErr := os.ReadFile(path)
	if readErr != nil {
		return "", readErr
	}
	encoded := make([]byte, base64.StdEncoding.EncodedLen(len(raw)))
	base64.StdEncoding.Encode(encoded, raw)
	return string(encoded), nil
}
// callOCR POSTs the base64-encoded image as JSON ({"image": b64}) to the local
// OCR service at ocrAPIURL and returns every non-empty "text" value found in
// the "results" array of the response, in response order.
//
// It panics with an exception when the request fails, when the HTTP status is
// not 200 (using the "message" field of the body), or when the body has no
// "results" array.
func callOCR(b64 string) []string {
	payload := map[string]string{"image": b64}
	resp, err := resty.New().R().
		SetHeader("Content-Type", "application/json").
		SetBody(payload).
		Post(ocrAPIURL)
	if err != nil {
		panic(exception.New(err.Error()))
	}

	body := string(resp.Body())
	if status := resp.StatusCode(); status != 200 {
		panic(exception.New(fmt.Sprintf("OCR failed: %s", gjson.Get(body, "message").String())))
	}

	results := gjson.Get(body, "results")
	if !(results.Exists() && results.IsArray()) {
		panic(exception.New("no OCR results found"))
	}

	// Collect the recognised text of each entry, dropping empty ones.
	var texts []string
	for _, item := range results.Array() {
		if text := item.Get("text").String(); text != "" {
			texts = append(texts, text)
		}
	}
	return texts
}
// Patterns used by extractInfoFromOCR, compiled once at package load rather
// than on every call.
var (
	// auSimPhonePattern captures the first 11-digit run that follows one of
	// the marker tokens.
	auSimPhonePattern = regexp.MustCompile(`(?:XH[^\x00-\xff]|H势|势|XH).*?(\d{11})`)
	// auSimTopUpPattern captures 100, 200 or 50 directly after "充值"
	// (optionally followed by a colon and whitespace).
	auSimTopUpPattern = regexp.MustCompile(`充值[:]?\s*(100|200|50)\b`)
	// auSimBareAmountPattern finds candidate 100/200/50 occurrences; the
	// caller rejects those that are part of a longer digit run.
	auSimBareAmountPattern = regexp.MustCompile(`100|200|50`)
)

// auSimFirstContained returns the first entry of candidates, in slice order,
// that occurs as a substring of text, or "" when none does.
func auSimFirstContained(text string, candidates []string) string {
	for _, candidate := range candidates {
		if strings.Contains(text, candidate) {
			return candidate
		}
	}
	return ""
}

// extractInfoFromOCR concatenates the OCR text fragments and extracts the
// phone number, salesperson name, school and first top-up amount from them.
//
// texts are the OCR fragments; they are joined without a separator. When
// delPhone is non-empty every occurrence of it is removed from the joined text
// first, so that digits of the phone number cannot be mistaken for an amount
// (the returned Phone is then usually empty).
//
// Fields that cannot be determined are left as "". The lookup order is:
//  1. Phone: first match of the phone pattern.
//  2. Name: first nameWhiteList entry found; school and amount are then
//     looked for in the 30 bytes either side of the name.
//  3. School fallback: searched in the whole text when no name was found or
//     the name's context had no school; the amount is looked for around it.
//  4. Amount fallbacks: "充值<amount>", then the literal "未充值", then a
//     standalone 100/200/50 anywhere in the text.
func extractInfoFromOCR(texts []string, delPhone string) ExtractedInfo {
	combined := strings.Join(texts, "")
	if delPhone != "" {
		combined = strings.ReplaceAll(combined, delPhone, "")
	}
	var info ExtractedInfo

	// Phone number.
	if m := auSimPhonePattern.FindStringSubmatch(combined); len(m) >= 2 {
		info.Phone = m[1]
	}

	// Salesperson name, then school and amount from the text around it.
	if name := auSimFirstContained(combined, nameWhiteList); name != "" {
		info.Name = name
		near := getContext(combined, name, 30)
		logkit.Info("【上下文 " + near + "】")
		info.School = auSimFirstContained(near, schoolWhiteList)
		info.Amount = auSimFirstContained(near, amountWhiteList)
	}

	switch {
	case info.Name == "":
		// No name found: look for the school in the whole text and take the
		// amount from the text around the school.
		if school := auSimFirstContained(combined, schoolWhiteList); school != "" {
			info.School = school
			info.Amount = auSimFirstContained(getContext(combined, school, 30), amountWhiteList)
		}
	case info.School == "":
		// Name found but no school near it: look for the school in the whole
		// text, and only if the amount is still unknown try around the school.
		if school := auSimFirstContained(combined, schoolWhiteList); school != "" {
			info.School = school
			if info.Amount == "" {
				info.Amount = auSimFirstContained(getContext(combined, school, 30), amountWhiteList)
			}
		}
	}

	// Amount fallback 1: keyword form such as "充值100", "充值200", "充值50".
	if info.Amount == "" {
		if m := auSimTopUpPattern.FindStringSubmatch(combined); len(m) >= 2 {
			info.Amount = m[1]
		}
	}

	// Amount fallback 2: distinguish "never topped up" from "not found"
	// before matching bare numbers.
	if info.Amount == "" && strings.Contains(combined, "未充值") {
		info.Amount = "未充值"
	}

	// Amount fallback 3: a bare 100/200/50 that is not part of a longer
	// number. Examples:
	//   "100", "200", "50"  -> match
	//   "1003", "5004"      -> no match
	//   "充值2004"           -> no match
	//   "abc100xyz"         -> match
	if info.Amount == "" {
		for _, loc := range auSimBareAmountPattern.FindAllStringIndex(combined, -1) {
			start, end := loc[0], loc[1]
			// The neighbouring bytes must not be ASCII digits.
			beforeOK := start == 0 || combined[start-1] < '0' || combined[start-1] > '9'
			afterOK := end == len(combined) || combined[end] < '0' || combined[end] > '9'
			if beforeOK && afterOK {
				info.Amount = combined[start:end]
				break
			}
		}
	}
	return info
}
// getContext returns the part of text around the first occurrence of keyword:
// up to length bytes before it, the keyword itself, and up to length bytes
// after it. It returns "" when keyword does not occur in text.
//
// length is a byte count, not a character count (a Chinese character takes
// three bytes in UTF-8); a negative length is treated as 0. Because a byte
// offset can fall inside a multi-byte character, both edges of the window are
// moved inwards to the nearest character boundary, so the result never starts
// or ends with a fragment of a character. A fragment could never match a
// whitelist entry anyway, so this only removes invalid bytes from the result.
func getContext(text, keyword string, length int) string {
	index := strings.Index(text, keyword)
	if index == -1 {
		return ""
	}
	if length < 0 {
		length = 0
	}
	keywordEnd := index + len(keyword)

	start := index - length
	if start < 0 {
		start = 0
	}
	end := keywordEnd + length
	if end > len(text) {
		end = len(text)
	}

	// UTF-8 continuation bytes have the bit pattern 10xxxxxx. Skip forward
	// past any at the start, and back off the end while it points into the
	// middle of a character; the keyword itself is never cut.
	for start < index && text[start]&0xC0 == 0x80 {
		start++
	}
	for end > keywordEnd && end < len(text) && text[end]&0xC0 == 0x80 {
		end--
	}
	return text[start:end]
}
// writeExcel writes data to a new workbook and saves it as filename.
//
// Sheet1 gets a header row (school, salesperson name, phone number, top-up
// amount) followed by one row per element of data. Before writing, school
// aliases are normalised to one canonical name, "未充值" becomes "0", and any
// empty school, name or amount is written as "?" so it can be completed by
// hand. The elements of data themselves are not modified.
//
// It panics with an exception if a row cannot be written or the file cannot be
// saved, instead of silently producing no output.
func writeExcel(data []ExtractedInfo, filename string) {
	f := excelize.NewFile()
	const sheet = "Sheet1"
	if err := f.SetSheetRow(sheet, "A1", &[]string{"学校", "业务员姓名", "手机号码", "充值金额"}); err != nil {
		panic(exception.New("写入Excel表头失败:" + err.Error()))
	}
	for i, d := range data {
		// d is a copy of the element, so the normalisation below stays local.
		switch d.School {
		case "万象":
			d.School = "万向"
		case "外国语":
			d.School = "浙外"
		case "浙大紫金港":
			d.School = "浙大"
		case "师范", "师大", "杭师大":
			d.School = "杭师"
		case "成院":
			d.School = "城院"
		case "中美院":
			d.School = "美院"
		case "":
			d.School = "?"
		}
		switch d.Amount {
		case "未充值":
			d.Amount = "0"
		case "":
			d.Amount = "?"
		}
		if d.Name == "" {
			d.Name = "?"
		}

		// Data rows start at row 2, below the header.
		cell, err := excelize.CoordinatesToCellName(1, i+2)
		if err != nil {
			panic(exception.New("写入Excel失败:" + err.Error()))
		}
		row := []string{d.School, d.Name, d.Phone, d.Amount}
		if err := f.SetSheetRow(sheet, cell, &row); err != nil {
			panic(exception.New("写入Excel失败:" + err.Error()))
		}
	}
	if err := f.SaveAs(filename); err != nil {
		panic(exception.New("保存Excel失败:" + err.Error()))
	}
}
// getBaiduAccessToken requests an access token from Baidu's OAuth endpoint
// with the client-credentials grant, using apiKey as client_id and secretKey
// as client_secret, and returns the "access_token" field of the response.
//
// It panics with an exception when the request fails or when the response
// carries no access token, rather than returning an empty token that would
// only surface later as a failed OCR call. The panic message includes the
// response's "error_description" field if present, otherwise the HTTP status
// and raw body.
func getBaiduAccessToken() string {
	const tokenURL = "https://aip.baidubce.com/oauth/2.0/token"
	resp, err := resty.New().R().
		SetHeader("Content-Type", "application/x-www-form-urlencoded").
		SetFormData(map[string]string{
			"grant_type":    "client_credentials",
			"client_id":     apiKey,
			"client_secret": secretKey,
		}).
		Post(tokenURL)
	if err != nil {
		panic(exception.New("获取AccessToken失败:" + err.Error()))
	}

	body := resp.String()
	accessToken := gjson.Get(body, "access_token").String()
	if accessToken == "" {
		reason := gjson.Get(body, "error_description").String()
		if reason == "" {
			reason = fmt.Sprintf("HTTP %d: %s", resp.StatusCode(), body)
		}
		panic(exception.New("获取AccessToken失败:" + reason))
	}
	return accessToken
}
// callOCRBaidu sends the base64-encoded image to Baidu's accurate_basic OCR
// endpoint as form data (with multidirectional_recognize enabled),
// authenticating with token, and returns every non-empty "words" value from
// the "words_result" array of the response, in response order.
//
// It panics with an exception when the request fails, when the response body
// contains an "error_code" field (the code and "error_msg" are included in
// the message so the real cause is visible), or when the body has no
// "words_result" array.
func callOCRBaidu(b64, token string) []string {
	url := "https://aip.baidubce.com/rest/2.0/ocr/v1/accurate_basic?access_token=" + token
	// Form fields of the request.
	payload := map[string]string{
		"image":                      b64,
		"multidirectional_recognize": "true",
	}
	resp, err := resty.New().R().
		SetHeader("Content-Type", "application/x-www-form-urlencoded").
		SetHeader("Accept", "application/json").
		SetFormData(payload).
		Post(url)
	if err != nil {
		panic(exception.New("请求百度OCR出错:" + err.Error()))
	}

	result := resp.String()
	// NOTE(review): assumes the API reports failures (expired token, quota,
	// bad image) as error_code / error_msg in the JSON body; confirm against
	// the API documentation.
	if code := gjson.Get(result, "error_code"); code.Exists() {
		panic(exception.New("请求百度OCR出错:" + fmt.Sprintf("error_code=%s, error_msg=%s", code.String(), gjson.Get(result, "error_msg").String())))
	}

	textResults := gjson.Get(result, "words_result")
	if !textResults.Exists() || !textResults.IsArray() {
		panic(exception.New("no OCR results found"))
	}

	// Collect the recognised text of each entry, dropping empty ones.
	var texts []string
	textResults.ForEach(func(_, value gjson.Result) bool {
		if text := value.Get("words").String(); text != "" {
			texts = append(texts, text)
		}
		return true
	})
	return texts
}