图片转表格

图片转表格

2024-12-06

Python

暂无评论

32 次阅读

同事提了一个需求：把图片里的表格通过 OCR 识别出来，并转存为 Excel 文件。

import cv2

from PIL import Image

import pytesseract

import pandas as pd

# Script: OCR a table out of an image and export the rows to an Excel file.
#
# Pipeline: load image -> grayscale -> inverse binary threshold -> save the
# pre-processed image -> run Tesseract on it -> parse the recognised text
# into rows -> write the rows (minus the first one) to an .xlsx file.

# Input image holding the table.
image_path = 'sj2.png'

# Pre-processed (thresholded) image that is handed to the OCR engine.
processed_image_path = 'processed_image.png'

# Destination Excel workbook.
excel_path = 'output.xlsx'


def _to_int_if_possible(token):
    """Return the integer before the first '.' in *token*, else *token*.

    OCR output frequently contains noise such as ``'a.b'`` or ``'.5'``; a
    token that cannot be converted is kept as the original string instead of
    aborting the whole run with ``ValueError``.
    """
    try:
        return int(token.split('.')[0])
    except ValueError:
        return token


def parse_table_text(text):
    """Turn raw OCR text into a list of table rows.

    For every line of *text*:

    * the line is split on single spaces and the first token is dropped;
      lines with no remaining tokens are skipped;
    * only tokens containing ``'.'`` or ``'-'`` are kept;
    * a row with exactly five kept tokens is treated as a complete row and
      its first token is remembered so that later short rows can be given a
      date;
    * a row with fewer than five kept tokens gets a reconstructed first
      column inserted at the front;
    * every token containing ``'.'`` is converted to the integer in front of
      the dot (tokens that are not numeric are left untouched).

    Args:
        text: The text returned by the OCR engine.

    Returns:
        A list of rows; each row is a list mixing ``str`` and ``int`` values.

    Raises:
        ValueError: If a complete (five-token) row's first token does not end
            in two digits, because the day number cannot be parsed.
    """
    rows = []
    # State carried from the most recent complete row, used to back-fill the
    # first column of rows where it was not recognised.
    date_prefix = ''
    next_day = ''

    for line in text.split('\n'):
        tokens = line.split(' ')[1:]
        if not tokens:
            continue

        # Build a filtered list rather than removing items while iterating
        # (the original iterated a reversed copy to avoid skipped elements).
        fields = [tok for tok in tokens if '.' in tok or '-' in tok]

        # Fill in the missing date.
        if len(fields) == 5:
            print(fields)
            # NOTE(review): presumably the first column looks like 'YY-MM-DD',
            # so [0:5] is 'YY-MM' and the last two characters are the day --
            # confirm against the real table. The day is simply incremented:
            # no zero padding, no month roll-over, and consecutive short rows
            # all receive the same value.
            date_prefix = fields[0][0:5]
            next_day = str(int(fields[0][-2:]) + 1)
            print(next_day)
        if len(fields) < 5:
            fields.insert(0, date_prefix + '-' + next_day)

        # Strip the trailing dot from numeric cells and convert them to int.
        fields = [
            _to_int_if_possible(cell) if '.' in cell else cell
            for cell in fields
        ]
        rows.append(fields)

    return rows


def main():
    """Run the image -> OCR -> Excel pipeline on the configured paths.

    Raises:
        FileNotFoundError: If the input image cannot be read.
    """
    # Load the image. cv2.imread signals failure by returning None instead of
    # raising, so check explicitly to give a clear error message.
    img = cv2.imread(image_path)
    if img is None:
        raise FileNotFoundError(f'Could not read image: {image_path}')

    # Convert to grayscale.
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # Apply an inverse binary threshold at 150.
    _, thresh = cv2.threshold(gray, 150, 255, cv2.THRESH_BINARY_INV)

    # Save the processed image for the OCR step.
    cv2.imwrite(processed_image_path, thresh)

    # Run OCR with pytesseract.
    text = pytesseract.image_to_string(processed_image_path)
    #print(text)

    # Parse the text into rows.
    data = parse_table_text(text)

    # Build the data frame, skipping the first parsed row.
    df = pd.DataFrame(data[1:])
    print(data[1:])

    # Export the data frame to an Excel file.
    df.to_excel(excel_path, index=False, header=False)


if __name__ == '__main__':
    main()

标签: none

添加新评论

最新文章

最近回复

分类

标签

归档

其它