python如何识别验证码中的图片,Python抓取登录验证码图片

　　首先导入需要用到的库：re、Image（来自 PIL）、pytesseract、selenium、time。

import re  # 用于正则
from PIL import Image  # 用于打开和处理图片
import pytesseract  # 用于将图片转换为文本
from selenium import webdriver  # 用于打开网站
import time  # 代码运行停顿

　　首先需要获取验证码图片，才能进一步识别。

　　创建一个类，在其中定义 webdriver 和 find_element_by_css_selector 方法，用于打开网页和定位验证码图片的元素。

class VerificationCode:
    def __init__(self):
        self.driver = webdriver.Firefox()
        self.find_element = self.driver.find_element_by_css_selector

　　然后打开浏览器抓取验证码图片。

    def get_pictures(self):
        self.driver.get('http://123.255.123.3')  # 打开登陆页面
        self.driver.save_screenshot('pictures.png')  # 全屏截图
        page_snap_obj = Image.open('pictures.png')
        img = self.find_element('#pic')  # 验证码元素的位置
        time.sleep(1)
        location = img.location
        size = img.size  # 获取验证码的大小参数
        left = location['x']
        top = location['y']
        right = left + size['width']
        bottom = top + size['height']
        image_obj = page_snap_obj.crop((left, top, right, bottom))  # 根据验证码的长度和宽度剪切验证码
        image_obj.show()  # 打开切割后的完整验证码
        self.driver.close()  # 处理完验证码后关闭浏览器
        return image_obj

　　处理前的验证码图片如下：

　　对于python来说，未经处理的验证码图片识别率较低。如果你仔细观察，你会发现图像中有许多干扰识别率的彩色斑点，这极大地影响了识别率。

　　相关：《Python视频教程》

　　所以下面我们对获取到的验证码进行处理。

　　首先用convert把图片转成灰度图，再做二值化：设置threshold阈值，灰度值低于阈值的像素置为黑色，其余像素置为白色。

def processing_image(self, threshold=160):
    """Return the captcha as a black-and-white (binarised) greyscale image.

    The captcha is obtained from ``self.get_pictures()``, converted to
    8-bit greyscale (mode ``"L"``) and then thresholded.

    Args:
        threshold: Grey level cut-off. Pixels darker than this become
            black (0); all others become white (255). The default of 160
            does not suit every captcha, so tune it per site.

    Returns:
        The binarised image (still mode ``"L"``, values 0 or 255 only).
    """
    image_obj = self.get_pictures()  # fetch the cropped captcha
    grey = image_obj.convert("L")  # to greyscale
    # ``point`` applies the mapping through a lookup table, which replaces
    # the hand-written per-pixel double loop.
    return grey.point(lambda level: 0 if level < threshold else 255)

经过灰度处理后的图片：

　　然后删除一些扰乱识别的像素点。

def delete_spot(self):
    """Remove isolated dark pixels (speckle noise) from the binarised captcha.

    A pixel counts as dark when its value is below 50. It is treated as
    noise, and painted white, when none of its four direct neighbours
    (up, left, down, right) is black (value below 10). Pixels on the image
    border are checked too; neighbours that fall outside the image simply
    do not count as black.

    Returns:
        The image returned by ``self.processing_image()``, modified in place.
    """
    images = self.processing_image()
    pixels = images.load()
    width, height = images.size
    neighbour_offsets = ((0, -1), (-1, 0), (0, 1), (1, 0))

    # Collect first and repaint afterwards, so the decision for every pixel
    # is taken against the untouched input image.
    isolated = []
    for x in range(width):
        for y in range(height):
            if pixels[x, y] >= 50:
                continue  # not a dark pixel, nothing to examine
            has_black_neighbour = any(
                0 <= x + dx < width
                and 0 <= y + dy < height
                and pixels[x + dx, y + dy] < 10
                for dx, dy in neighbour_offsets
            )
            if not has_black_neighbour:
                isolated.append((x, y))

    for xy in isolated:
        pixels[xy] = 255
    return images

经过去除噪点处理后的图片：

　　最后把处理后的图片转成文字。

　　先设置pytesseract所调用的tesseract可执行文件路径，因为默认路径往往不正确；然后把图片转换为文字。由于个别图片处理后仍有遗漏的噪点，可能被识别成空格、点或者分号之类的字符，所以增加了一步去除验证码中特殊字符的处理。

def image_str(self):
    """Run OCR on the cleaned captcha and return its first four characters.

    Returns:
        Up to four characters of recognised text, after everything except
        CJK ideographs, ASCII digits and ASCII letters has been removed.
    """
    image = self.delete_spot()

    # Point pytesseract at the default Windows install location, but only
    # when that executable really exists; otherwise keep pytesseract's own
    # lookup (``tesseract`` on PATH) so other platforms keep working.
    tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
    if os.path.isfile(tesseract_cmd):
        pytesseract.pytesseract.tesseract_cmd = tesseract_cmd

    result = pytesseract.image_to_string(image)  # image -> text
    # Leftover noise is sometimes read as spaces, dots, semicolons, etc.;
    # drop every character that cannot be part of the captcha.
    resultj = re.sub(u"([^\u4e00-\u9fa5\u0030-\u0039\u0041-\u005a\u0061-\u007a])", "", result)
    return resultj[0:4]  # the captcha has four characters

完整代码如下：

import os  # check whether the Tesseract executable exists
import re  # regular expressions
import time  # pause while the page renders

import pytesseract  # image-to-text OCR
from PIL import Image  # open and process images
from selenium import webdriver  # drive the browser
from selenium.webdriver.common.by import By  # element locator strategies
　　
class VerificationCode:
    """Fetch a login captcha with Selenium, clean it up and OCR it.

    The steps are chained: ``image_str`` calls ``delete_spot``, which calls
    ``processing_image``, which calls ``get_pictures``.
    """

    LOGIN_URL = 'http://123.255.123.3'  # login page that shows the captcha
    CAPTCHA_SELECTOR = '#pic'  # CSS selector of the captcha <img>
    SCREENSHOT_PATH = 'pictures.png'  # where the page screenshot is saved
    # Default Windows install location of the Tesseract executable.
    TESSERACT_CMD = r"C:\Program Files\Tesseract-OCR\tesseract.exe"

    def __init__(self):
        """Start a Firefox WebDriver session."""
        self.driver = webdriver.Firefox()

    def find_element(self, selector):
        """Return the first element matching the CSS ``selector``.

        ``find_element_by_css_selector`` no longer exists in current
        Selenium releases, so the generic ``find_element`` call is used.
        """
        return self.driver.find_element(By.CSS_SELECTOR, selector)

    def get_pictures(self):
        """Open the login page and return the captcha cropped from a screenshot.

        Side effects: writes ``SCREENSHOT_PATH``, opens the cropped captcha
        in the system image viewer, and ends the browser session.

        Returns:
            A ``PIL.Image.Image`` containing only the captcha.
        """
        try:
            self.driver.get(self.LOGIN_URL)  # open the login page
            time.sleep(1)  # let the page render before it is captured
            element = self.find_element(self.CAPTCHA_SELECTOR)
            location = element.location
            size = element.size  # captcha width and height
            self.driver.save_screenshot(self.SCREENSHOT_PATH)  # whole page
        finally:
            # Always end the session, even on failure, so no browser or
            # driver process is left behind.
            self.driver.quit()

        left = location['x']
        top = location['y']
        right = left + size['width']
        bottom = top + size['height']
        # ``crop`` yields an independent image, so the screenshot file can
        # be closed as soon as the region has been cut out.
        with Image.open(self.SCREENSHOT_PATH) as page_snap_obj:
            image_obj = page_snap_obj.crop((left, top, right, bottom))
        image_obj.show()  # display the cropped captcha
        return image_obj

    def processing_image(self, threshold=160):
        """Return the captcha as a black-and-white (binarised) greyscale image.

        Args:
            threshold: Grey level cut-off. Pixels darker than this become
                black (0); all others become white (255). The default of
                160 does not suit every captcha, so tune it per site.

        Returns:
            The binarised image (mode ``"L"``, values 0 or 255 only).
        """
        image_obj = self.get_pictures()  # fetch the cropped captcha
        grey = image_obj.convert("L")  # to greyscale
        # ``point`` applies the mapping through a lookup table, which
        # replaces the hand-written per-pixel double loop.
        return grey.point(lambda level: 0 if level < threshold else 255)

    def delete_spot(self):
        """Remove isolated dark pixels (speckle noise) from the captcha.

        A pixel counts as dark when its value is below 50. It is treated as
        noise, and painted white, when none of its four direct neighbours
        (up, left, down, right) is black (value below 10). Border pixels
        are checked too; neighbours outside the image do not count.

        Returns:
            The image from ``processing_image()``, modified in place.
        """
        images = self.processing_image()
        pixels = images.load()
        width, height = images.size
        neighbour_offsets = ((0, -1), (-1, 0), (0, 1), (1, 0))

        # Collect first and repaint afterwards, so every decision is taken
        # against the untouched input image.
        isolated = []
        for x in range(width):
            for y in range(height):
                if pixels[x, y] >= 50:
                    continue  # not a dark pixel, nothing to examine
                has_black_neighbour = any(
                    0 <= x + dx < width
                    and 0 <= y + dy < height
                    and pixels[x + dx, y + dy] < 10
                    for dx, dy in neighbour_offsets
                )
                if not has_black_neighbour:
                    isolated.append((x, y))

        for xy in isolated:
            pixels[xy] = 255
        return images

    def image_str(self):
        """Run OCR on the cleaned captcha and return its first four characters.

        Returns:
            Up to four characters of recognised text, after everything
            except CJK ideographs, ASCII digits and ASCII letters has been
            removed.
        """
        image = self.delete_spot()

        # Override pytesseract's executable path only when the configured
        # file exists; otherwise keep its own lookup (``tesseract`` on
        # PATH) so other platforms keep working.
        if os.path.isfile(self.TESSERACT_CMD):
            pytesseract.pytesseract.tesseract_cmd = self.TESSERACT_CMD

        result = pytesseract.image_to_string(image)  # image -> text
        # Leftover noise is sometimes read as spaces, dots, semicolons,
        # etc.; drop every character that cannot be part of the captcha.
        resultj = re.sub(u"([^\u4e00-\u9fa5\u0030-\u0039\u0041-\u005a\u0061-\u007a])", "", result)
        return resultj[0:4]  # the captcha has four characters
　　
# Script entry point: recognise the captcha once and print the result, so
# that running the file produces visible output.
if __name__ == '__main__':
    a = VerificationCode()
    print(a.image_str())

郑重声明：本文由网友发布，不代表盛行IT的观点，版权归原作者所有，仅为传播更多信息之目的，如有侵权请联系，我们将第一时间修改或删除，多谢。

相关文章阅读