|
|
package cmd
|
|
|
|
|
|
import (
|
|
|
"encoding/base64"
|
|
|
"fmt"
|
|
|
"github.com/go-resty/resty/v2"
|
|
|
"github.com/mizuki1412/go-core-kit/class/exception"
|
|
|
"github.com/mizuki1412/go-core-kit/init/initkit"
|
|
|
"github.com/mizuki1412/go-core-kit/service/logkit"
|
|
|
"github.com/spf13/cobra"
|
|
|
"github.com/tidwall/gjson"
|
|
|
"github.com/xuri/excelize/v2"
|
|
|
"io/ioutil"
|
|
|
"os"
|
|
|
"path/filepath"
|
|
|
"regexp"
|
|
|
"strings"
|
|
|
"time"
|
|
|
)
|
|
|
|
|
|
func init() {
|
|
|
rootCmd.AddCommand(auSimCmd)
|
|
|
}
|
|
|
|
|
|
// Package-level configuration for the "au" command.
//
// NOTE(review): the Baidu credentials and the absolute paths below are
// committed in source. Consider loading them from flags/environment and
// rotating the key pair — confirm with the owner before changing the values.
var (
	// apiKey is sent as client_id when requesting a Baidu access token.
	apiKey = "oEr6D60u6mmJRPlAHEIx8dWN"

	// secretKey is sent as client_secret when requesting a Baidu access token.
	secretKey = "r7fYVHExzsqlYO9P2kgdvs5N1WVJL8vB"

	// imageDir is the directory scanned for *.jpg images; processed images
	// are renamed into sub-directories of it.
	imageDir = "/Users/leo/Documents/au-ocr/image/"

	// ocrAPIURL is the endpoint of the local OCR service.
	ocrAPIURL = "http://127.0.0.1:5005/ocr"

	// outputExcelPath is the directory the result workbook is written to.
	outputExcelPath = "/Users/leo/Documents/au-ocr/excel/"

	// nameWhiteList holds the salesperson names searched for in the OCR text.
	// Lookup takes the FIRST entry contained in the text, so a name that
	// contains another entry must be listed before that shorter entry
	// (e.g. "郑佳欣" before "佳欣"), otherwise it can never be selected.
	nameWhiteList = []string{
		"德宁", "鼎斌", "海波", "何晓璇", "胡鑫", "郑佳欣", "佳欣", "佳怡",
		"嘉乐", "建闽", "锦城", "景浩", "君豪", "凯彬", "兰青", "李想",
		"林艳", "刘美云", "裴雅妮", "任子健", "润宇", "隆蝶", "石明毅", "覃彩玉",
		"唐鑫", "唐宇豪", "童斌", "万兴凯", "王权", "吴宇峰", "武文迪", "夏晨阳",
		"项乐奇", "小锐", "小颖", "晓雪", "谢俊", "欣萍", "鑫杰", "徐宁",
		"许慧超", "雅妮", "杨传杰", "杨帅", "杨笑笑", "杨兴俊", "叶琪婷", "宇飞",
		"玉梅", "张钧帅", "张奕韬", "张玉", "张原硕", "章帅", "赵林冲", "朱菲玲",
		"子健", "邹思惠", "徐林焱", "周志乐", "林焱", "嘉辉", "楚俊", "军豪",
	}

	// schoolWhiteList holds the school keywords searched for in the OCR text.
	schoolWhiteList = []string{"财经", "工商", "工业", "杭电", "计量", "金融", "经济", "经贸", "科技", "理工", "美院", "万向", "长征", "浙音", "万象", "特殊教育", "外国语", "浙大紫金港"}

	// amountWhiteList holds the accepted first top-up values, in match priority order.
	amountWhiteList = []string{"200", "100", "未充值", "50"}
)
|
|
|
|
|
|
// ExtractedInfo is the set of fields pulled out of the OCR text of one image.
// A field is left as the empty string when it could not be recognised.
type ExtractedInfo struct {
	// Name is the salesperson name (an entry of nameWhiteList).
	Name string
	// School is the school keyword (an entry of schoolWhiteList).
	School string
	// Phone is the 11-digit phone number captured from the OCR text.
	Phone string
	// Amount is the first top-up value ("200", "100", "50" or "未充值").
	Amount string
}
|
|
|
|
|
|
var auSimCmd = &cobra.Command{
|
|
|
Use: "au",
|
|
|
Short: "Batch processing operations of the autumn semester sim card",
|
|
|
Run: func(cmd *cobra.Command, args []string) {
|
|
|
initkit.BindFlags(cmd)
|
|
|
files, err := ioutil.ReadDir(imageDir)
|
|
|
if err != nil {
|
|
|
panic(exception.New(err.Error()))
|
|
|
}
|
|
|
tokenBaidu := getBaiduAccessToken()
|
|
|
results := []ExtractedInfo{}
|
|
|
for _, file := range files {
|
|
|
if file.IsDir() || !strings.HasSuffix(file.Name(), ".jpg") || strings.Contains(file.Name(), "thumb.jpg") || strings.Contains(file.Name(), "hd.jpg") {
|
|
|
continue
|
|
|
}
|
|
|
fullPath := filepath.Join(imageDir, file.Name())
|
|
|
base64Str, err := imageToBase64(fullPath)
|
|
|
if err != nil {
|
|
|
panic(exception.New("转base64失败:" + err.Error()))
|
|
|
}
|
|
|
//第一次调用本地的
|
|
|
ocrText := callOCR(base64Str)
|
|
|
logkit.Info("成功调用本地OCR")
|
|
|
if err != nil {
|
|
|
panic(exception.New("OCR请求失败:" + err.Error()))
|
|
|
}
|
|
|
info := extractInfoFromOCR(ocrText)
|
|
|
//只要返回结果中存在空值的,就再调用另外的接口
|
|
|
if info.Name == "" || info.Phone == "" || info.School == "" || info.Amount == "" {
|
|
|
ocrTextBaidu := callOCRBaidu(base64Str, tokenBaidu)
|
|
|
logkit.Info("¥成功调用百度OCR")
|
|
|
infoBaidu := extractInfoFromOCR(ocrTextBaidu)
|
|
|
//合并
|
|
|
if info.Name == "" {
|
|
|
info.Name = infoBaidu.Name
|
|
|
}
|
|
|
if info.Phone == "" {
|
|
|
info.Phone = infoBaidu.Phone
|
|
|
}
|
|
|
if info.School == "" {
|
|
|
info.School = infoBaidu.School
|
|
|
}
|
|
|
if info.Amount == "" {
|
|
|
info.Amount = infoBaidu.Amount
|
|
|
}
|
|
|
}
|
|
|
//号码里包含了这些敏感数字
|
|
|
if info.Phone != "" && (strings.Contains(info.Phone, "200") || strings.Contains(info.Phone, "100") || strings.Contains(info.Phone, "50")) {
|
|
|
if info.Name != "" {
|
|
|
combined := strings.Join(ocrText, "")
|
|
|
context := getContext(combined, info.Name, 30)
|
|
|
if strings.Contains(context, info.Phone) {
|
|
|
newName := "fail/" + file.Name() + ".phone.jpg"
|
|
|
newPath := filepath.Join(imageDir, newName)
|
|
|
os.Rename(fullPath, newPath)
|
|
|
logkit.Info("业务员:" + info.Name + " 学校:" + info.School + " 首充:" + info.Amount + " 号码:" + info.Phone + " 号码里包含充值敏感数字,可能会误识别!")
|
|
|
continue
|
|
|
}
|
|
|
} else {
|
|
|
newName := "fail/" + file.Name() + ".phone.jpg"
|
|
|
newPath := filepath.Join(imageDir, newName)
|
|
|
os.Rename(fullPath, newPath)
|
|
|
logkit.Info("业务员:" + info.Name + " 学校:" + info.School + " 首充:" + info.Amount + " 号码:" + info.Phone + " 号码里包含充值敏感数字,可能会误识别!")
|
|
|
continue
|
|
|
}
|
|
|
}
|
|
|
if info.Phone != "" && containsBroadband(ocrText) {
|
|
|
newName := "broadband/" + file.Name()
|
|
|
newPath := filepath.Join(imageDir, newName)
|
|
|
os.Rename(fullPath, newPath)
|
|
|
info.Name = info.Name + "+宽带"
|
|
|
logkit.Info(info.Phone + " √√√加宽带!√√√")
|
|
|
}
|
|
|
oldPath := fullPath
|
|
|
newName := ""
|
|
|
//条件放宽到识别出号码就算成功,剩下不成功的 打?手动
|
|
|
if info.Phone != "" {
|
|
|
logkit.Info("业务员:" + info.Name + " 学校:" + info.School + " 首充:" + info.Amount + " 号码:" + info.Phone + " 识别成功!")
|
|
|
newName = "success/" + fmt.Sprintf("%s%s.jpg", info.Name, info.Phone)
|
|
|
results = append(results, info)
|
|
|
} else {
|
|
|
logkit.Info("××× 业务员:" + info.Name + " 学校:" + info.School + " 首充:" + info.Amount + " 号码:" + info.Phone + " 识别失败!")
|
|
|
newName = "fail/" + file.Name() + ".error.jpg"
|
|
|
}
|
|
|
|
|
|
newPath := filepath.Join(imageDir, newName)
|
|
|
os.Rename(oldPath, newPath)
|
|
|
|
|
|
}
|
|
|
|
|
|
writeExcel(results, outputExcelPath+"/"+time.Now().Format("20060102150405")+".xlsx")
|
|
|
},
|
|
|
}
|
|
|
|
|
|
// containsBroadband reports whether any single OCR text fragment contains the
// keyword "宽带". Fragments are checked individually, not concatenated.
func containsBroadband(texts []string) bool {
	found := false
	for i := 0; i < len(texts) && !found; i++ {
		found = strings.Contains(texts[i], "宽带")
	}
	return found
}
|
|
|
|
|
|
// imageToBase64 reads the file at path and returns its content encoded with
// the standard (padded) base64 alphabet. On a read failure it returns an empty
// string together with the error from os.ReadFile.
func imageToBase64(path string) (string, error) {
	raw, readErr := os.ReadFile(path)
	if readErr != nil {
		return "", readErr
	}
	encoded := make([]byte, base64.StdEncoding.EncodedLen(len(raw)))
	base64.StdEncoding.Encode(encoded, raw)
	return string(encoded), nil
}
|
|
|
|
|
|
func callOCR(b64 string) []string {
|
|
|
client := resty.New()
|
|
|
resp, err := client.R().
|
|
|
SetHeader("Content-Type", "application/json").
|
|
|
SetBody(map[string]string{
|
|
|
"image": b64,
|
|
|
}).
|
|
|
Post(ocrAPIURL)
|
|
|
|
|
|
if err != nil {
|
|
|
panic(exception.New(err.Error()))
|
|
|
}
|
|
|
bodyStr := string(resp.Body())
|
|
|
// 快速检查 resultcode 是否为 200
|
|
|
if resp.StatusCode() != 200 {
|
|
|
panic(exception.New(fmt.Errorf("OCR failed: %s", gjson.Get(bodyStr, "message").String()).Error()))
|
|
|
}
|
|
|
// 使用 GJSON 提取所有文本字段
|
|
|
var texts []string
|
|
|
textResults := gjson.Get(bodyStr, "results")
|
|
|
|
|
|
if !textResults.Exists() || !textResults.IsArray() {
|
|
|
panic(exception.New(fmt.Errorf("no OCR results found").Error()))
|
|
|
}
|
|
|
|
|
|
textResults.ForEach(func(_, value gjson.Result) bool {
|
|
|
text := value.Get("text").String()
|
|
|
if text != "" {
|
|
|
texts = append(texts, text)
|
|
|
}
|
|
|
return true
|
|
|
})
|
|
|
return texts
|
|
|
}
|
|
|
|
|
|
func extractInfoFromOCR(texts []string) ExtractedInfo {
|
|
|
combined := strings.Join(texts, "")
|
|
|
info := ExtractedInfo{}
|
|
|
// 提取手机号
|
|
|
re := regexp.MustCompile(`(?:XH[^\x00-\xff]|H势|势|XH).*?(\d{11})`)
|
|
|
match := re.FindStringSubmatch(combined)
|
|
|
if len(match) >= 2 {
|
|
|
info.Phone = match[1]
|
|
|
}
|
|
|
|
|
|
// 提取业务员姓名
|
|
|
for _, name := range nameWhiteList {
|
|
|
if strings.Contains(combined, name) {
|
|
|
info.Name = name
|
|
|
context := getContext(combined, name, 30)
|
|
|
logkit.Info("【上下文 " + context + "】")
|
|
|
// 从上下文中查找学校
|
|
|
for _, school := range schoolWhiteList {
|
|
|
if strings.Contains(context, school) {
|
|
|
info.School = school
|
|
|
break
|
|
|
}
|
|
|
}
|
|
|
|
|
|
// 从上下文中查找金额
|
|
|
for _, amount := range amountWhiteList {
|
|
|
if strings.Contains(context, amount) {
|
|
|
info.Amount = amount
|
|
|
break
|
|
|
}
|
|
|
}
|
|
|
|
|
|
break
|
|
|
}
|
|
|
}
|
|
|
//如果到这一步业务员的名字还没找到,那么就放弃提取业务员名字上下文去找学校,改成直接全文去找学校,然后从学校获取上下文找首充
|
|
|
if info.Name == "" {
|
|
|
for _, school := range schoolWhiteList {
|
|
|
if strings.Contains(combined, school) {
|
|
|
info.School = school
|
|
|
context := getContext(combined, school, 30)
|
|
|
// 从上下文中查找金额
|
|
|
for _, amount := range amountWhiteList {
|
|
|
if strings.Contains(context, amount) {
|
|
|
info.Amount = amount
|
|
|
break
|
|
|
}
|
|
|
}
|
|
|
break
|
|
|
}
|
|
|
}
|
|
|
}
|
|
|
//如果到这一步业务员名字找到了,业务员上下文没有学校,那么还是直接全文去找学校
|
|
|
if info.Name != "" && info.School == "" {
|
|
|
for _, school := range schoolWhiteList {
|
|
|
if strings.Contains(combined, school) {
|
|
|
info.School = school
|
|
|
//如果到这里首充还是空白的,试着在学校上下文找首充
|
|
|
if info.Amount == "" {
|
|
|
context := getContext(combined, school, 30)
|
|
|
// 从上下文中查找金额
|
|
|
for _, amount := range amountWhiteList {
|
|
|
if strings.Contains(context, amount) {
|
|
|
info.Amount = amount
|
|
|
break
|
|
|
}
|
|
|
}
|
|
|
}
|
|
|
break
|
|
|
}
|
|
|
}
|
|
|
}
|
|
|
|
|
|
//如果到这一步充值金额还没找到,那么可以以关键字去寻找充值金额,适用于充值100,充值200,充值50这类关键词
|
|
|
if info.Amount == "" {
|
|
|
re2 := regexp.MustCompile(`充值[::]?\s*(100|200|50)\b`)
|
|
|
match2 := re2.FindStringSubmatch(combined)
|
|
|
if len(match2) >= 2 {
|
|
|
info.Amount = match2[1]
|
|
|
}
|
|
|
}
|
|
|
//到这里要判断一下是不是压根没充值,还是没找到再去全文正则匹配纯100、200、50
|
|
|
if info.Amount == "" {
|
|
|
if strings.Contains(combined, "未充值") {
|
|
|
info.Amount = "未充值"
|
|
|
}
|
|
|
}
|
|
|
|
|
|
//如果到这一步充值金额还没找到,再通过正则去找纯100、200、50
|
|
|
/*
|
|
|
"100", // 匹配
|
|
|
"200", // 匹配
|
|
|
"50", // 匹配
|
|
|
"1003", // 不匹配
|
|
|
"5004", // 不匹配
|
|
|
"2004", // 不匹配
|
|
|
"充值200", // 匹配
|
|
|
"充值2004", // 不匹配
|
|
|
"abc100xyz", // 匹配
|
|
|
*/
|
|
|
if info.Amount == "" {
|
|
|
re3 := regexp.MustCompile(`100|200|50`)
|
|
|
allMatches := re3.FindAllStringIndex(combined, -1)
|
|
|
for _, loc := range allMatches {
|
|
|
start, end := loc[0], loc[1]
|
|
|
beforeOK := start == 0 || (combined[start-1] < '0' || combined[start-1] > '9')
|
|
|
afterOK := end == len(combined) || (combined[end] < '0' || combined[end] > '9')
|
|
|
if beforeOK && afterOK {
|
|
|
info.Amount = combined[start:end]
|
|
|
break
|
|
|
}
|
|
|
}
|
|
|
}
|
|
|
|
|
|
return info
|
|
|
}
|
|
|
|
|
|
// getContext returns the first occurrence of keyword in text together with up
// to length bytes on either side, or "" when keyword does not occur.
//
// The window is measured in bytes, but its edges are moved inward to the
// nearest UTF-8 rune boundary so the result never starts or ends in the middle
// of a multi-byte character. For valid UTF-8 input this does not change which
// valid UTF-8 substrings the result contains. A negative length is treated as
// 0 and a length larger than the text is clamped to len(text).
func getContext(text, keyword string, length int) string {
	index := strings.Index(text, keyword)
	if index == -1 {
		return ""
	}
	if length < 0 {
		length = 0
	}
	if length > len(text) {
		// Also keeps the additions below from overflowing.
		length = len(text)
	}

	start := index - length
	if start < 0 {
		start = 0
	}
	keywordEnd := index + len(keyword)
	end := keywordEnd + length
	if end > len(text) {
		end = len(text)
	}

	// A byte of the form 10xxxxxx is a UTF-8 continuation byte, i.e. not the
	// first byte of a rune.
	for start < index && text[start]&0xC0 == 0x80 {
		start++
	}
	for end > keywordEnd && end < len(text) && text[end]&0xC0 == 0x80 {
		end--
	}
	return text[start:end]
}
|
|
|
|
|
|
func writeExcel(data []ExtractedInfo, filename string) {
|
|
|
f := excelize.NewFile()
|
|
|
sheet := "Sheet1"
|
|
|
f.SetSheetRow(sheet, "A1", &[]string{"学校", "业务员姓名", "手机号码", "充值金额"})
|
|
|
|
|
|
for i, d := range data {
|
|
|
if d.School == "万象" {
|
|
|
d.School = "万向"
|
|
|
}
|
|
|
if d.School == "外国语" {
|
|
|
d.School = "浙外"
|
|
|
}
|
|
|
if d.School == "浙大紫金港" {
|
|
|
d.School = "浙大"
|
|
|
}
|
|
|
if d.Amount == "未充值" {
|
|
|
d.Amount = "0"
|
|
|
}
|
|
|
if d.Amount == "" {
|
|
|
d.Amount = "?"
|
|
|
}
|
|
|
if d.Name == "" {
|
|
|
d.Name = "?"
|
|
|
}
|
|
|
if d.School == "" {
|
|
|
d.School = "?"
|
|
|
}
|
|
|
row := []string{d.School, d.Name, d.Phone, d.Amount}
|
|
|
cell, _ := excelize.CoordinatesToCellName(1, i+2)
|
|
|
f.SetSheetRow(sheet, cell, &row)
|
|
|
}
|
|
|
|
|
|
f.SaveAs(filename)
|
|
|
}
|
|
|
|
|
|
func getBaiduAccessToken() string {
|
|
|
client := resty.New()
|
|
|
url := "https://aip.baidubce.com/oauth/2.0/token"
|
|
|
data := map[string]string{
|
|
|
"grant_type": "client_credentials",
|
|
|
"client_id": apiKey,
|
|
|
"client_secret": secretKey,
|
|
|
}
|
|
|
resp, err := client.R().
|
|
|
SetHeader("Content-Type", "application/x-www-form-urlencoded").
|
|
|
SetFormData(data).
|
|
|
Post(url)
|
|
|
if err != nil {
|
|
|
panic(exception.New("获取AccessToken失败:" + err.Error()))
|
|
|
}
|
|
|
accessToken := gjson.Get(resp.String(), "access_token").String()
|
|
|
return accessToken
|
|
|
}
|
|
|
|
|
|
func callOCRBaidu(b64, token string) []string {
|
|
|
url := "https://aip.baidubce.com/rest/2.0/ocr/v1/accurate_basic?access_token=" + token
|
|
|
// 构造表单数据
|
|
|
payload := map[string]string{
|
|
|
"image": b64,
|
|
|
"multidirectional_recognize": "true",
|
|
|
}
|
|
|
client := resty.New()
|
|
|
resp, err := client.R().
|
|
|
SetHeader("Content-Type", "application/x-www-form-urlencoded").
|
|
|
SetHeader("Accept", "application/json").
|
|
|
SetFormData(payload).
|
|
|
Post(url)
|
|
|
if err != nil {
|
|
|
panic(exception.New("请求百度OCR出错:" + err.Error()))
|
|
|
}
|
|
|
|
|
|
// 使用 gjson 解析结果
|
|
|
result := resp.String()
|
|
|
var texts []string
|
|
|
textResults := gjson.Get(result, "words_result")
|
|
|
|
|
|
if !textResults.Exists() || !textResults.IsArray() {
|
|
|
panic(exception.New(fmt.Errorf("no OCR results found").Error()))
|
|
|
}
|
|
|
textResults.ForEach(func(_, value gjson.Result) bool {
|
|
|
text := value.Get("words").String()
|
|
|
if text != "" {
|
|
|
texts = append(texts, text)
|
|
|
}
|
|
|
return true
|
|
|
})
|
|
|
return texts
|
|
|
}
|