先利用浏览器的开发者选项获取表格的位置
确定位置后利用request库和BeautifulSoup库进行简单的爬取
import requests
from bs4 import BeautifulSoup
def getHTMLText(url):
try:
kv = {
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36'}
r = requests.get(url, headers=kv, timeout=30)
r.raise_for_status()
r.encoding = r.apparent_encoding
return r.text
except:
return ""
def fillUnivList(ulist, html):
soup = BeautifulSoup(html, "html.parser")
body = soup.body
data = body.find(attrs={'class': 'billboard-bd'})
trs = data.find_all('tr')
for tr in trs:
name = tr.find('a').string
ulist.append(name)
return ulist
def printUnivList(ulist):
num = 1
for i in ulist:
print('{:>2}: {:<}'.format(num, i))
num += 1
def main():
uinfo = []
url = "https://movie.douban.com/"
html = getHTMLText(url)
fillUnivList(uinfo, html)
printUnivList(uinfo)
main()