[Crawler Practice] Using recursion to collect all internal and external links of a website
Environment: Windows 7 + Python 3.6 + PyCharm 2017
Goal: start from the top level of a website and crawl all of its internal and external links, so that a site map can be drawn.
A typical website is about 5 levels deep, with roughly 10 pages per level, so most sites have on the order of 10^5 pages, i.e. fewer than 100,000. Python's default recursion limit, however, is 1000, so the sys module is used to raise that limit. For easier control during test runs a counter variable iii is added; you can drop the counter if you don't need it. The code is short and fairly simple, so here it is.
#coding=utf-8
from urllib.parse import urlparse
from urllib.request import Request
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re,datetime,random
import sys

sys.setrecursionlimit(5000)  # raise the default recursion limit of 1000 by hand; 5000 is used here

internalLinks = set()
externalLinks = set()
iii = 0  # page counter used to cap the crawl while testing
random.seed(datetime.datetime.now())

# collect internal links
def getInternalLinks(includeUrl):
    global internalLinks, iii
    iii += 1
    if iii <= 10:  # "if" rather than "while", otherwise a page with no new links is re-fetched forever
        data = Request(url=includeUrl, headers={'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/535.19'})
        html = urlopen(data)
        # print(iii, html)
        soup = BeautifulSoup(html, 'html.parser')
        # print(soup)
        # reduce the current URL to scheme://netloc, i.e. the site root
        includeUrl = '{}://{}'.format(urlparse(includeUrl).scheme, urlparse(includeUrl).netloc)
        # find <a> hrefs that start with "/" or that contain the site root
        for link in soup.find_all('a', href=re.compile('^(/|.*' + includeUrl + ')')):
            if link.attrs['href'] is not None:
                # build the absolute URL before de-duplicating, so relative links are not re-crawled
                if link.attrs['href'].startswith('/'):
                    newUrl = includeUrl + link.attrs['href']
                else:
                    newUrl = link.attrs['href']
                if newUrl not in internalLinks:
                    internalLinks.add(newUrl)
                    getExternalLinks(soup, includeUrl)
                    getInternalLinks(newUrl)
    return

# collect external links
def getExternalLinks(soup, excludeUrl):
    global externalLinks
    # find <a> hrefs that start with http or www and do not contain the current site's URL
    for link in soup.find_all('a', href=re.compile('^(http|www)((?!' + excludeUrl + ').)*$')):
        if link.attrs['href'] is not None:
            if link.attrs['href'] not in externalLinks:
                externalLinks.add(link.attrs['href'])
    return
getInternalLinks('http://www.jindishoes.com')
print('Collected %d internal links in total!' % len(internalLinks))
print('Collected %d external links in total!' % len(externalLinks))
print(internalLinks)
print(externalLinks)
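Since the stated goal is to draw a site map, it can be handy to dump the two sets to a file once the crawl finishes instead of only printing them. Below is a minimal sketch under the assumption that the internalLinks and externalLinks sets above are already populated; the file name sitemap.txt is just an example.

# write the collected links to a plain-text file, one URL per line (hypothetical file name)
with open('sitemap.txt', 'w', encoding='utf-8') as f:
    f.write('--- internal links ---\n')
    for url in sorted(internalLinks):
        f.write(url + '\n')
    f.write('--- external links ---\n')
    for url in sorted(externalLinks):
        f.write(url + '\n')

The sorted() calls simply make the output stable between runs, which makes it easier to diff two crawls of the same site.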