| """ 抓取豆瓣250条数据
pip install beautifulsoup4 pip install lxml pip install requests
"""
import requests
from bs4 import BeautifulSoup
class Movie(object):
    """One ranked movie entry scraped from the Douban Top 250 list."""

    def __init__(self, no, title, pic, rate, url):
        self.no = no        # rank on the list (text as scraped)
        self.title = title  # movie title
        self.pic = pic      # poster image URL
        self.rate = rate    # rating score (text as scraped)
        self.url = url      # detail-page URL

    def __str__(self):
        # Tab-separated line; this is the exact record format written to movies.txt.
        return "{}\t{}\t{}\t{}\t{}".format(self.no, self.title, self.pic, self.rate, self.url)
def fetch(url):
    """Download a page and return its HTML text, or None on failure.

    Args:
        url: page URL to request.

    Returns:
        The response body as text when the server answers 200; otherwise
        None (also on network errors, which are printed and swallowed so
        the crawl can continue with the remaining pages).
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/96.0.4664.110 Safari/537.36 Edg/96.0.1054.57 "
    }
    try:
        # timeout so a stalled connection cannot hang the whole crawl
        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code == 200:
            return response.text
    except Exception as e:
        # best-effort crawl: report the error and let the caller skip this page
        print(e)
    return None
def parse(html: str):
    """Parse one Top-250 listing page into Movie objects.

    Args:
        html: the HTML of a listing page (25 entries per page).

    Returns:
        A list of Movie instances, one per ``div.item`` entry, in page order.
    """
    bs = BeautifulSoup(html, "lxml")
    movies = []
    for item in bs.select("div.item"):
        no = item.select_one(".pic em").text
        title = item.select_one(".info .title").text
        pic = item.select_one(".pic img").attrs["src"]
        rate = item.select_one(".info .rating_num").text
        url = item.select_one(".pic a").attrs["href"]
        movies.append(Movie(no, title, pic, rate, url))
    return movies
def save_to_text(movies):
    """Write one tab-separated line per movie to ./movies.txt (UTF-8).

    Args:
        movies: iterable of objects whose str() yields one output line.
    """
    # join() replaces the original quadratic ``str += ...`` accumulation and
    # avoids shadowing the builtin ``str`` with a local variable.
    lines = "".join("{}\n".format(movie) for movie in movies)
    with open("./movies.txt", "w", encoding="utf8") as f:
        f.write(lines)


if __name__ == "__main__":
    all_movies = []
    for i in range(0, 10):
        # Page 1 is /top250; subsequent pages paginate with ?start=25,50,...
        if i == 0:
            url = "https://movie.douban.com/top250"
        else:
            url = "https://movie.douban.com/top250?start={}".format(25 * i)
        result = fetch(url)
        if result:
            movies = parse(result)  # fixed typo: was "moviles"
            all_movies.extend(movies)
    save_to_text(all_movies)