所谓静态网站,就是所需内容可以直接在网页源代码里找到的网站。那么,我们怎样从这样的网页中抓取需要的数据呢?
步骤思路:
1. 用 urllib 依次打开每个页面并读取网页源代码;
2. 用正则表达式从源代码中提取所需字段;
3. 用 xlwt 把提取到的数据写入 Excel 文件。
import urllib.request
import re
import xlwt
def getWebSiteData(start=26700, stop=26800):
    """Fetch a range of risfond.com FMCG case pages and extract fields from each.

    Args:
        start: First case id to request (inclusive). Defaults to 26700.
        stop: Case id to stop at (exclusive). Defaults to 26800.

    Returns:
        A list with one entry per requested page, in id order. Each entry is
        the list produced by ``re.findall`` on that page's decoded HTML. An
        empty id range yields an empty list without touching the network.

    Raises:
        urllib.error.URLError: If a page cannot be fetched (propagated from
            ``urlopen``; nothing is swallowed here).
        UnicodeDecodeError: If a response body is not valid UTF-8.
    """
    # NOTE(review): this pattern has no literal anchors, so every match is
    # empty and findall() yields only '' strings. It looks like the HTML tag
    # literals were lost when the snippet was pasted -- restore them from the
    # real page markup before relying on the extracted data.
    field_pattern = re.compile('.*?(.*?)')

    data_list = []
    for i in range(start, stop):
        # URL of the case page to scrape.
        url = 'http://www.risfond.com/case/fmcg/{}'.format(i)
        # Read the page source; the context manager closes the HTTP response
        # even if read() or decode() raises, instead of leaking the socket.
        with urllib.request.urlopen(url) as response:
            html = response.read().decode('utf-8')
        # Collect every captured field for this page.
        data_list.append(field_pattern.findall(html))
    return data_list
def excel_write(items):
    """Write the scraped rows to the workbook file '高儿夫.xls'.

    Row 0 holds the seven bold column headers; each element of ``items`` is
    written to the next row, one value per column.

    Args:
        items: Sequence of rows, each a sliceable sequence of cell values
            (as returned by ``getWebSiteData``). Only the first seven values
            of a row are written; a row with fewer values leaves its
            remaining cells empty rather than raising ``IndexError``.
    """
    newTable = '高儿夫.xls'
    wb = xlwt.Workbook(encoding='utf-8')
    ws = wb.add_sheet('wpf')
    headData = ['职位名称', '职位地点', '时间', '行业', '招聘时间', '人数', '顾问']

    # Build the bold header style once and reuse it for every header cell.
    head_style = xlwt.easyxf('font:bold on')
    for column, title in enumerate(headData):
        ws.write(0, column, title, head_style)

    # Data starts on row 1, directly below the header row.
    for row_index, row in enumerate(items, start=1):
        # Truncate to the header width so a page that yielded fewer fields
        # cannot abort the export before the workbook is saved.
        for column, value in enumerate(row[:len(headData)]):
            ws.write(row_index, column, value)

    wb.save(newTable)
# Script driver: runs at module load. First scrape every case page, then
# export the collected rows to the Excel workbook.
items = getWebSiteData()
excel_write(items)
以上就是爬取数据的简单实现,如有问题可随时联系。