抓取豆瓣250条数据
"""
抓取豆瓣250条数据

pip install beautifulsoup4
pip install lxml
pip install requests

"""



import requests
from bs4 import BeautifulSoup


class Movie(object):
    """A single entry scraped from the Douban Top 250 list."""

    def __init__(self, no, title, pic, rate, url):
        # Rank number, title, poster image URL, rating string, detail-page URL.
        self.no = no
        self.title = title
        self.pic = pic
        self.rate = rate
        self.url = url

    def __str__(self):
        # Tab-separated line, consumed by save_to_text().
        fields = (self.no, self.title, self.pic, self.rate, self.url)
        return "\t".join(map(str, fields))


def fetch(url):
    """Download *url* and return the response body as text.

    Returns None on a non-200 status code or on any request failure
    (connection error, timeout, ...); the error is printed, not raised.
    """
    headers = {
        # Browser-like UA so Douban does not reject the scraper outright.
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/96.0.4664.110 Safari/537.36 Edg/96.0.1054.57 "
    }
    try:
        # timeout= prevents the scraper from hanging forever on a stalled
        # connection (the original call had no timeout at all).
        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code == 200:
            return response.text
    except requests.RequestException as e:
        # Narrowed from bare `Exception`: only network/HTTP errors are expected here.
        print(e)
    return None


def parse(html: str):
    """Parse one Top-250 result page and return a list of Movie records."""
    soup = BeautifulSoup(html, "lxml")
    movies = []
    for item in soup.select("div.item"):
        movies.append(
            Movie(
                item.select_one(".pic em").text,
                item.select_one(".info .title").text,
                item.select_one(".pic img").attrs["src"],
                item.select_one(".info .rating_num").text,
                item.select_one(".pic a").attrs["href"],
            )
        )
    return movies

def save_to_text(movies):
    """Write one tab-separated line per movie to ./movies.txt (UTF-8).

    *movies* is any iterable of objects whose str() yields the desired line.
    An empty iterable produces an empty file.
    """
    # Build the output in one pass with join() instead of the original
    # quadratic `str +=` loop, which also shadowed the builtin `str`.
    text = "".join(str(movie) + "\n" for movie in movies)
    with open("./movies.txt", "w", encoding="utf8") as f:
        f.write(text)

if __name__ == "__main__":
all_movies = []

for i in range(0, 10):
url = ""
if i == 0:
url = "https://movie.douban.com/top250"
else:
url = "https://movie.douban.com/top250?start={}".format(str(25 * i))
result = fetch(url)
if result:
moviles = parse(result)
all_movies.extend(moviles)

save_to_text(all_movies)

作者

建指所向

发布于

2021-12-20

更新于

2023-11-07

许可协议