# Python web-scraper example (爬虫案例)

import requests
import sys
import json
import re
import time
import pymysql
import hashlib
import os
import urllib.request
from requests.exceptions import RequestException
from urllib.parse import unquote,quote
from urllib.parse import urlparse
from bs4 import BeautifulSoup
from pyquery import PyQuery as pq
import random
import warnings
warnings.filterwarnings('ignore')
def ext(path):
    """Return the file extension of *path*, including the leading dot ('' if none)."""
    _, extension = os.path.splitext(path)
    return extension

def md5str(text):
    """Return the hex MD5 digest of *text* (UTF-8 encoded).

    The parameter was renamed from ``str`` — shadowing the builtin made the
    name ``str`` unusable inside the function.
    """
    digest = hashlib.md5()
    digest.update(text.encode("utf-8"))
    return digest.hexdigest()

def cookie():
    """Return the saved session cookie string read from C:/cookie1.txt."""
    # Context manager guarantees the handle is closed even if read() raises.
    with open('C:/cookie1.txt', 'r') as f:
        return f.read()

def getip():
    """Fetch one fresh proxy address from the KDL API and persist it to C:/ip3.txt.

    The file is later read back by ip(). Overwrites any previous proxy.
    """
    headers = {'content-type': 'application/json'}
    ret = requests.get('http://dps.kdlapi.com/api/getdps/?orderid=916257388705561&num=1&pt=1&format=json&sep=1', headers=headers)
    # requests' built-in JSON decoding replaces the manual json.loads(ret.text).
    json_data = ret.json()
    # Context manager closes the file even if the write fails.
    with open("C:/ip3.txt", "w") as f:
        f.write(json_data['data']['proxy_list'][0])

def ip():
    """Return the proxy address last saved to C:/ip3.txt by getip()."""
    # Context manager guarantees the handle is closed even if read() raises.
    with open('C:/ip3.txt', 'r') as f:
        return f.read()

def _clean(text):
    """Collapse a scraped value: strip, then drop newlines and internal spaces."""
    return text.strip().replace("\n", "").replace(" ", "")


def _contact_field(soup, index, child_tag):
    """Text of the <child_tag> inside the index-th span.cvlu of div.dcontent.

    Returns '' when the container, the span, or the child tag is missing.
    (The original indexed spans[1..3] inside the guard condition itself and
    could raise IndexError when the page had fewer contact entries.)
    """
    container = soup.find("div", attrs={"class": "dcontent"})
    if container is None:
        return ''
    spans = container.find_all("span", attrs={"class": "cvlu"})
    if index >= len(spans):
        return ''
    child = spans[index].find(child_tag)
    if child is None:
        return ''
    return _clean(child.get_text())


def parse_content(urlstr):
    """Fetch one Qichacha company-detail page, parse its fields, and insert
    them into MySQL.

    Returns False when the URL was already crawled (its md5 exists in
    web_enterprise_url) or the page could not be fetched; True after a
    parse/insert attempt.
    """
    md5 = md5str(urlstr)

    # Dedup check: skip URLs that were already processed.
    db = pymysql.connect("120.77.225.157", "root", "xxxxx", "feiya")
    try:
        cursor = db.cursor()
        # Parameterized query instead of string concatenation (SQL-injection safe).
        cursor.execute("SELECT * FROM web_enterprise_url WHERE md5=%s", (md5,))
        results = cursor.fetchall()
    finally:
        db.close()
    if results:
        return False

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36',
        'Cookie': cookie()
    }
    time.sleep(10)  # throttle so the target site does not ban the proxy
    print(ip())
    proxies = {"http": 'http://' + ip()}
    try:
        response = requests.get(urlstr, headers=headers, proxies=proxies)
    except RequestException:
        # The original swallowed the error and then crashed on the unbound
        # `response` variable; bail out instead.
        print("程序报错了")
        return False

    soup = BeautifulSoup(response.text, 'html.parser')
    doc = pq(response.text)
    table = doc("#Cominfo").find("table")
    name = _clean(doc("h1").text())

    # div.dcontent holds four span.cvlu contact entries in this order:
    # phone (nested <span>), website (<a>), email (<a>), address (<a>).
    phone = _contact_field(soup, 0, "span")
    web_url = _contact_field(soup, 1, "a")
    email = _contact_field(soup, 2, "a")
    address = _contact_field(soup, 3, "a")

    legal = ''
    bpen = soup.find("div", attrs={"class": "bpen"})
    if bpen and bpen.find("h2"):
        legal = _clean(bpen.find("h2").get_text())

    # Registration table uses fixed cell positions: odd <td> indexes hold
    # values (even ones hold labels). IndexError here means the layout changed.
    nodes = table('td')

    def cell(i):
        # Cleaned text of the i-th <td> of the registration table.
        return _clean(nodes[i].text)

    capital = cell(3)
    payin = cell(5)
    status = cell(7)
    found_time = cell(9)
    credit_code = cell(11)
    discern_code = cell(13)
    reg_num = cell(15)
    organize_code = cell(17)
    company_type = cell(19)  # renamed: `type` shadowed the builtin
    industry = cell(21)
    approved = cell(23)
    organ = cell(25)
    area = cell(27)  # parsed in the original but never stored; kept for parity
    enname = cell(29)
    usedname = cell(31)
    insured = cell(33)
    scale = cell(35)
    open_time = cell(37)
    scope = cell(41)
    qualification = ''
    descs = ''

    info = [name, capital, found_time, status, reg_num, credit_code,
            organize_code, company_type, discern_code, open_time, industry,
            approved, payin, scale, insured, organ, usedname, enname, scope]

    db = pymysql.connect("120.77.225.157", "root", "$20180626Feiyaoa$18062", "feiya")
    try:
        cursor = db.cursor()
        sql = ("INSERT INTO web_enterprise_business(name, legal, email, phone, web_url, "
               "capital, found_time, reg_num, credit_code, organize_code, type, "
               "discern_code, open_time, industry,qualification, approved, payin, "
               "scale, insured, organ, address,usedname, url, enname, status, scope, "
               "descs, platform) VALUES(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, "
               "%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)")
        try:
            cursor.execute(sql, (name, legal, email, phone, web_url, capital,
                                 found_time, reg_num, credit_code, organize_code,
                                 company_type, discern_code, open_time, industry,
                                 qualification, approved, payin, scale, insured,
                                 organ, address, usedname, urlstr, enname, status,
                                 scope, descs, 2))
            # (md5,) -- the original passed (md5), i.e. a bare string, not a tuple.
            cursor.execute("INSERT INTO web_enterprise_url(md5) VALUES(%s)", (md5,))
            db.commit()
        except pymysql.MySQLError:
            db.rollback()
    finally:
        # The original leaked the connection when an exception escaped before close().
        db.close()
    print(info)
    return True

def spider(urlstr):
    """Crawl one Qichacha search-result page.

    Every 'firm_' anchor is handed to parse_content(); 'search' anchors are
    collected into spider_urls (collected but, as in the original, never
    followed — kept for parity).
    """
    print("开始爬行:"+urlstr)

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36',
        'Cookie': cookie()
    }

    time.sleep(10)  # throttle so the target site does not ban the proxy

    proxies = {"http": 'http://' + ip()}

    try:
        response = requests.get(urlstr, headers=headers, proxies=proxies)
    except RequestException:
        print("程序报错了")
        return

    soup = BeautifulSoup(response.text, 'html.parser')
    anchors = soup.find_all("a")

    spider_urls = []
    for node in anchors or []:
        # .get() avoids the KeyError the original relied on a bare except for.
        href = node.get("href")
        if href is None:
            continue
        # Detail-page links: contain 'firm_' and no fragment or query string.
        if 'firm_' in href and '#' not in href and '?' not in href:
            print("解析页面:https://www.qichacha.com" + href)
            try:
                parse_content("https://www.qichacha.com" + href)
            except Exception:
                # Best-effort per-link: log and keep crawling the rest.
                print("程序报错了")
        elif 'search' in href:
            # Original tested `find('search') > -1` twice — a duplicated
            # condition, collapsed to one membership test.
            spider_urls.append(href)
    return
def main():
    """Enumerate Qichacha search queries (keyword x sort order x page 1..249)
    and crawl every result page."""
    words = ["汉", "字", "表", "丂", "丄", "丅", "丆"]
    sorts = ['', 'startdate-true', 'startdate-false', 'registcapi-false', 'registcapi-true']
    for word in words:
        for sort_field in sorts:
            for page in range(1, 250):
                url = ('https://www.qichacha.com/search_index?key=广州' + word
                       + '&p=' + str(page) + "&ajaxflag=1&sortField=" + sort_field)
                spider(url)

# Script entry point: start the crawl only when run directly, not on import.
if __name__ == '__main__':
    main()