You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

304 lines
9.2 KiB
Go

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

package cmd
import (
"encoding/base64"
"fmt"
"github.com/go-resty/resty/v2"
"github.com/mizuki1412/go-core-kit/class/exception"
"github.com/mizuki1412/go-core-kit/init/initkit"
"github.com/mizuki1412/go-core-kit/service/logkit"
"github.com/spf13/cobra"
"github.com/tidwall/gjson"
"github.com/xuri/excelize/v2"
"io/ioutil"
"os"
"path/filepath"
"regexp"
"strings"
"time"
)
// init registers the "au" subcommand on the root command.
func init() {
	rootCmd.AddCommand(auSimCmd)
}
// Package-level configuration for the "au" command: input/output locations,
// the OCR endpoint, and the keyword whitelists used to interpret OCR text.
var (
// imageDir is the directory scanned for .jpg images. Processed images are
// renamed into its fail/, success/ and broadband/ sub-paths.
imageDir = "/Users/leo/Documents/au-ocr/image/"
// ocrAPIURL is the endpoint callOCR POSTs the base64-encoded image to.
ocrAPIURL = "http://127.0.0.1:5005/ocr"
// outputExcelPath is the directory that receives the timestamped .xlsx report.
outputExcelPath = "/Users/leo/Documents/au-ocr/excel/"
// nameWhiteList lists the salesperson names searched for in the OCR text.
// Entries are tried in slice order with a substring test and the first hit
// wins, so a short name placed before a longer name that contains it shadows
// the longer one.
// NOTE(review): "佳欣" precedes "郑佳欣", so "郑佳欣" can never be selected;
// "许慧超" is listed twice. Confirm whether both are intended.
nameWhiteList = []string{"德宁", "鼎斌", "海波", "何晓璇", "胡鑫", "佳欣", "佳怡", "嘉乐", "建闽", "锦城", "景浩", "君豪", "凯彬", "兰青", "李想", "林艳", "刘美云", "裴雅妮", "任子健", "润宇", "隆蝶", "石明毅", "覃彩玉", "唐鑫", "唐宇豪", "童斌", "万兴凯", "王权", "吴宇峰", "武文迪", "夏晨阳", "项乐奇", "小锐", "小颖", "晓雪", "谢俊", "欣萍", "鑫杰", "徐宁", "许慧超", "雅妮", "杨传杰", "杨帅", "杨笑笑", "杨兴俊", "叶琪婷", "宇飞", "玉梅", "张钧帅", "张奕韬", "张玉", "张原硕", "章帅", "赵林冲", "郑佳欣", "朱菲玲", "子健", "邹思惠", "徐林焱", "周志乐", "林焱", "许慧超", "嘉辉", "楚俊", "军豪"}
// schoolWhiteList lists the school keywords searched for in the OCR text, in
// priority order. "万象", "外国语" and "浙大紫金港" are rewritten to
// "万向", "浙外" and "浙大" by writeExcel.
schoolWhiteList = []string{"财经", "工商", "工业", "杭电", "计量", "金融", "经济", "经贸", "科技", "理工", "美院", "万向", "长征", "浙音", "万象", "特殊教育", "外国语", "浙大紫金港"}
// amountWhiteList lists the recognised first-recharge values, in priority
// order. writeExcel writes "未充值" as "0".
amountWhiteList = []string{"200", "100", "未充值", "50"}
)
// ExtractedInfo holds the fields recovered from the OCR text of one image.
// A field is left as the empty string when nothing could be recognised.
type ExtractedInfo struct {
	Name   string // salesperson name, an entry of nameWhiteList
	School string // school keyword, an entry of schoolWhiteList
	Phone  string // 11-digit number captured by the phone pattern
	Amount string // first-recharge amount
}
// auSimCmd implements the "au" subcommand. For every .jpg in imageDir
// (skipping directories and names containing "thumb.jpg" or "hd.jpg") it:
//
//  1. base64-encodes the image and sends it to the OCR service,
//  2. extracts salesperson, school, phone number and recharge amount,
//  3. renames the image into fail/, broadband/ or success/ under imageDir,
//  4. collects every record that has a phone number.
//
// After the loop the collected records are written to a timestamped .xlsx
// file under outputExcelPath. Failing to list the directory or to read an
// image panics with an exception; a failed rename is logged and the run
// continues.
var auSimCmd = &cobra.Command{
	Use:   "au",
	Short: "Batch processing operations of the autumn semester sim card",
	Run: func(cmd *cobra.Command, args []string) {
		initkit.BindFlags(cmd)
		files, err := ioutil.ReadDir(imageDir)
		if err != nil {
			panic(exception.New(err.Error()))
		}
		var results []ExtractedInfo
		for _, file := range files {
			fileName := file.Name()
			if file.IsDir() || !strings.HasSuffix(fileName, ".jpg") || strings.Contains(fileName, "thumb.jpg") || strings.Contains(fileName, "hd.jpg") {
				continue
			}
			fullPath := filepath.Join(imageDir, fileName)
			base64Str, err := imageToBase64(fullPath)
			if err != nil {
				panic(exception.New("转base64失败:" + err.Error()))
			}
			// callOCR returns no error value; it panics by itself on failure.
			ocrText := callOCR(base64Str)
			info := extractInfoFromOCR(ocrText)

			// The phone number itself contains digits that look like a recharge
			// amount. If the number sits next to the salesperson name (or no
			// name was found at all) the amount may have been read out of the
			// phone number, so the image is set aside for manual review.
			if info.Phone != "" && (strings.Contains(info.Phone, "200") || strings.Contains(info.Phone, "100") || strings.Contains(info.Phone, "50")) {
				ambiguous := info.Name == ""
				if !ambiguous {
					combined := strings.Join(ocrText, "")
					ambiguous = strings.Contains(getContext(combined, info.Name, 30), info.Phone)
				}
				if ambiguous {
					moveAuImage(fullPath, filepath.Join(imageDir, "fail/"+fileName+".phone.jpg"))
					logkit.Info(auInfoSummary(info) + " 号码里包含充值敏感数字,可能会误识别!")
					continue
				}
			}

			// Images that mention broadband are parked in broadband/ under their
			// original file name and the record is tagged in the report.
			alreadyMoved := false
			if info.Phone != "" && containsBroadband(ocrText) {
				alreadyMoved = moveAuImage(fullPath, filepath.Join(imageDir, "broadband/"+fileName))
				info.Name = info.Name + "+宽带"
				logkit.Info(info.Phone + " √√√加宽带!√√√")
			}

			// A recognised phone number is enough to count as success; the
			// remaining blanks are written as "?" and filled in by hand.
			var newName string
			if info.Phone != "" {
				logkit.Info(auInfoSummary(info) + " 识别成功!")
				newName = "success/" + fmt.Sprintf("%s%s.jpg", info.Name, info.Phone)
				results = append(results, info)
			} else {
				logkit.Info("××× " + auInfoSummary(info) + " 识别失败!")
				newName = "fail/" + fileName + ".error.jpg"
			}
			// Once the image has been moved to broadband/ it no longer exists at
			// fullPath, so a second rename could only fail; skip it.
			if !alreadyMoved {
				moveAuImage(fullPath, filepath.Join(imageDir, newName))
			}
		}
		writeExcel(results, filepath.Join(outputExcelPath, time.Now().Format("20060102150405")+".xlsx"))
	},
}

// auInfoSummary renders the extracted fields as the common prefix of the
// per-image log lines.
func auInfoSummary(info ExtractedInfo) string {
	return "业务员:" + info.Name + " 学校:" + info.School + " 首充:" + info.Amount + " 号码:" + info.Phone
}

// moveAuImage renames src to dst and reports whether the rename succeeded.
// A failure (for example a missing target directory) is logged rather than
// aborting the whole batch.
func moveAuImage(src, dst string) bool {
	if err := os.Rename(src, dst); err != nil {
		logkit.Info("move image failed: " + err.Error())
		return false
	}
	return true
}
// containsBroadband reports whether any single OCR text fragment contains the
// keyword "宽带". Fragments are checked individually, so a keyword split
// across two fragments does not count.
func containsBroadband(texts []string) bool {
	const keyword = "宽带"
	for i := 0; i < len(texts); i++ {
		if strings.Index(texts[i], keyword) >= 0 {
			return true
		}
	}
	return false
}
// imageToBase64 reads the file at path and returns its content encoded with
// standard (padded) base64. If the file cannot be read, it returns the empty
// string together with the read error.
func imageToBase64(path string) (string, error) {
	raw, err := os.ReadFile(path)
	if err != nil {
		return "", err
	}
	encoded := make([]byte, base64.StdEncoding.EncodedLen(len(raw)))
	base64.StdEncoding.Encode(encoded, raw)
	return string(encoded), nil
}
// auOCRClient is shared by all callOCR invocations so connections to the OCR
// service can be reused instead of building a new client for every image.
var auOCRClient = resty.New()

// callOCR posts the base64-encoded image b64 to ocrAPIURL as the JSON body
// {"image": b64} and returns the non-empty "text" values found in the
// response's "results" array, in response order.
//
// It panics with an exception when the request fails, when the HTTP status
// is not 200 (using the response's "message" field as detail), or when the
// response has no "results" array.
func callOCR(b64 string) []string {
	resp, err := auOCRClient.R().
		SetHeader("Content-Type", "application/json").
		SetBody(map[string]string{
			"image": b64,
		}).
		Post(ocrAPIURL)
	if err != nil {
		panic(exception.New(err.Error()))
	}
	bodyStr := string(resp.Body())
	// Success is judged by the HTTP status code, not by a field in the body.
	if resp.StatusCode() != 200 {
		panic(exception.New(fmt.Sprintf("OCR failed: %s", gjson.Get(bodyStr, "message").String())))
	}
	textResults := gjson.Get(bodyStr, "results")
	if !textResults.Exists() || !textResults.IsArray() {
		panic(exception.New("no OCR results found"))
	}
	// Collect every recognised text fragment, skipping empty ones.
	var texts []string
	for _, item := range textResults.Array() {
		if text := item.Get("text").String(); text != "" {
			texts = append(texts, text)
		}
	}
	return texts
}
// Patterns used by extractInfoFromOCR, compiled once at package load instead
// of on every call.
var (
	// auPhonePattern captures the first 11-digit run that follows one of the
	// marker sequences in the OCR text.
	auPhonePattern = regexp.MustCompile(`(?:XH[^\x00-\xff]|H势|势).*?(\d{11})`)
	// auRechargePattern captures 100, 200 or 50 directly after "充值".
	auRechargePattern = regexp.MustCompile(`充值[:]?\s*(100|200|50)\b`)
)

// extractInfoFromOCR joins the OCR fragments into one string and derives the
// phone number, salesperson name, school and first-recharge amount from it.
//
// Lookup order:
//  1. Phone: first match of auPhonePattern.
//  2. Name: first nameWhiteList entry contained in the text. School and
//     amount are then searched in the 30-byte context around the name.
//  3. No name found: school is searched in the whole text and the amount in
//     the context around that school.
//  4. Name found but no school nearby: school is searched in the whole text;
//     if the amount is still empty it is searched around that school.
//  5. Amount still empty: fall back to auRechargePattern on the whole text.
//
// Fields that cannot be determined stay empty. The context around the
// matched name is written to the log.
func extractInfoFromOCR(texts []string) ExtractedInfo {
	combined := strings.Join(texts, "")
	info := ExtractedInfo{}

	if match := auPhonePattern.FindStringSubmatch(combined); len(match) == 2 {
		info.Phone = match[1]
	}

	if name, ok := auFirstContained(combined, nameWhiteList); ok {
		info.Name = name
		nearby := getContext(combined, name, 30)
		logkit.Info("【上下文 " + nearby + "】")
		if school, found := auFirstContained(nearby, schoolWhiteList); found {
			info.School = school
		}
		if amount, found := auFirstContained(nearby, amountWhiteList); found {
			info.Amount = amount
		}
	}

	// No salesperson: anchor on the school instead and look for the amount
	// around it.
	if info.Name == "" {
		if school, ok := auFirstContained(combined, schoolWhiteList); ok {
			info.School = school
			if amount, found := auFirstContained(getContext(combined, school, 30), amountWhiteList); found {
				info.Amount = amount
			}
		}
	}

	// Salesperson found but no school next to the name: search the whole text
	// for the school, and only look for the amount there if it is still unknown.
	if info.Name != "" && info.School == "" {
		if school, ok := auFirstContained(combined, schoolWhiteList); ok {
			info.School = school
			if info.Amount == "" {
				if amount, found := auFirstContained(getContext(combined, school, 30), amountWhiteList); found {
					info.Amount = amount
				}
			}
		}
	}

	// Last resort for the amount: an explicit "充值100" style phrase.
	if info.Amount == "" {
		if match := auRechargePattern.FindStringSubmatch(combined); len(match) == 2 {
			info.Amount = match[1]
		}
	}
	return info
}

// auFirstContained returns the first entry of candidates that occurs as a
// substring of text, and whether any entry matched.
func auFirstContained(text string, candidates []string) (string, bool) {
	for _, candidate := range candidates {
		if strings.Contains(text, candidate) {
			return candidate, true
		}
	}
	return "", false
}
// getContext returns the first occurrence of keyword in text together with up
// to length bytes of surrounding text on each side, clipped to the bounds of
// text. It returns "" when keyword does not occur.
//
// length is measured in bytes. Because the text is typically multi-byte
// UTF-8, a raw byte window can begin or end in the middle of a character; the
// window edges are therefore moved inward to the nearest character boundary
// so the result is never cut mid-character. A negative length is treated as 0.
func getContext(text, keyword string, length int) string {
	index := strings.Index(text, keyword)
	if index == -1 {
		return ""
	}
	// Clamp length: negative values would invert the window, and anything
	// larger than the text is clipped anyway.
	switch {
	case length < 0:
		length = 0
	case length > len(text):
		length = len(text)
	}
	keywordEnd := index + len(keyword)
	start := index - length
	if start < 0 {
		start = 0
	}
	end := keywordEnd + length
	if end > len(text) {
		end = len(text)
	}
	// UTF-8 continuation bytes have the bit pattern 10xxxxxx. Skip them at the
	// left edge and back off over them at the right edge, never crossing into
	// the keyword itself.
	for start > 0 && start < index && text[start]&0xC0 == 0x80 {
		start++
	}
	for end < len(text) && end > keywordEnd && text[end]&0xC0 == 0x80 {
		end--
	}
	return text[start:end]
}
// writeExcel writes data to a new workbook saved at filename. Sheet1 gets a
// header row (school, salesperson, phone, amount) followed by one row per
// record, starting at row 2.
//
// Values are normalised for the report before writing:
//   - school aliases "万象", "外国语" and "浙大紫金港" become "万向", "浙外"
//     and "浙大";
//   - amount "未充值" becomes "0";
//   - an empty school, name or amount becomes "?" so it can be filled in by
//     hand.
//
// The caller's slice is not modified. Any failure while filling or saving
// the workbook panics with an exception instead of being silently dropped,
// so a missing report cannot go unnoticed.
func writeExcel(data []ExtractedInfo, filename string) {
	const sheet = "Sheet1"
	f := excelize.NewFile()
	if err := f.SetSheetRow(sheet, "A1", &[]string{"学校", "业务员姓名", "手机号码", "充值金额"}); err != nil {
		panic(exception.New("write excel header failed: " + err.Error()))
	}
	// d is a per-iteration copy, so the edits below stay local to the report.
	for i, d := range data {
		switch d.School {
		case "万象":
			d.School = "万向"
		case "外国语":
			d.School = "浙外"
		case "浙大紫金港":
			d.School = "浙大"
		case "":
			d.School = "?"
		}
		switch d.Amount {
		case "未充值":
			d.Amount = "0"
		case "":
			d.Amount = "?"
		}
		if d.Name == "" {
			d.Name = "?"
		}
		row := []string{d.School, d.Name, d.Phone, d.Amount}
		// Row 1 is the header, so record i goes to row i+2 in column A.
		cell, err := excelize.CoordinatesToCellName(1, i+2)
		if err != nil {
			panic(exception.New("resolve excel cell failed: " + err.Error()))
		}
		if err := f.SetSheetRow(sheet, cell, &row); err != nil {
			panic(exception.New("write excel row failed: " + err.Error()))
		}
	}
	if err := f.SaveAs(filename); err != nil {
		panic(exception.New("save excel failed: " + err.Error()))
	}
}