# -*- coding: utf-8 -*-
# MovieSpider.py
# Scrape the movies now playing on Douban and their short comments, then build word clouds.
from urllib import request
from bs4 import BeautifulSoup
import re
import time
import jieba  # Chinese word segmentation
import pandas as pd
import matplotlib.pyplot as plt
from wordcloud import WordCloud
# import spider.login  # local login helper, only needed if the login call in __main__ is enabled

def get_movie_list(page):
    # Collect id, title, score, actors and ticket link for every movie now playing in Beijing.
    URL = "https://movie.douban.com/cinema/nowplaying/beijing/"
    resp = request.urlopen(URL)
    htmldata = resp.read().decode("utf-8")
    soup = BeautifulSoup(htmldata, "html.parser")
    nowplaying = soup.find_all("div", id="nowplaying")
    movieList = nowplaying[0].find_all('li', class_='list-item')
    listItem = []
    for item in movieList:
        nowplaying_dict = {'id': item['id'], 'data-title': item['data-title'],
                           'data-score': item['data-score'], 'data-actors': item['data-actors']}
        # print(item['id'], item['data-title'], item['data-score'])
        movieharf = item.find_all('a', class_="ticket-btn")
        nowplaying_dict['href'] = movieharf[0]['href']
        listItem.append(nowplaying_dict)
    print(listItem[0]['id'])
    return listItem
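
# Illustrative only: each entry returned by get_movie_list() is a dict shaped roughly like the
# hypothetical example below; the actual field values depend on what Douban is currently showing.
#   {'id': '1234567', 'data-title': '...', 'data-score': '8.5',
#    'data-actors': '...', 'href': 'https://...'}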

def get_movie_commnts(movie_list_id, start):
    # Fetch one page (20 items) of short comments for the given Douban subject id.
    comments_url = 'https://movie.douban.com/subject/' + movie_list_id + \
                   '/comments?start=' + str(start) + '&limit=20'
    resp_comments = request.urlopen(comments_url)
    comments_data = resp_comments.read().decode("utf-8")
    comments_soup = BeautifulSoup(comments_data, "html.parser")
    comments = comments_soup.find_all("div", id="comments")
    commentsList = comments[0].find_all('div', class_='comment-item')
    # Total comment count, parsed from the active tab text, e.g. "看过(12345)" (currently unused)
    count = comments_soup.find('li', class_='is-active')
    count_str = count.find('span').get_text()
    num = re.findall(r'[^()]+', count_str)[1]
    commentsItem = []
    for comments_item in commentsList:
        comments_all = comments_item.find_all('p', class_="")[0].get_text()
        commentsItem.append(comments_all)
    return commentsItem
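
# Douban pages short comments with the start/limit query parameters, so, for example,
# get_movie_commnts('1234567', 20) (hypothetical subject id) should fetch the second page
# of 20 comments; the main block below walks the first ten pages per movie.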

if __name__ == "__main__":
    # page = spider.login.loggin()
    page = None
    movie_list = get_movie_list(page)
    for l in movie_list:
        # Collect the first ten pages (200 items) of short comments for this movie
        content = []
        for i in range(0, 10):
            start = i * 20
            content.append(get_movie_commnts(l['id'], start))
        # Keep only the Chinese characters from every page and write them to "<title>.txt"
        pattern = re.compile(r'[\u4e00-\u9fa5]+')
        cleaned_comments = ''
        for page_comments in content:
            filterdata = re.findall(pattern, str(page_comments))
            cleaned_comments = cleaned_comments + ''.join(filterdata)
        fh = open(l['data-title'] + ".txt", 'w', encoding='utf8')
        fh.write(cleaned_comments)
        fh.close()
        time.sleep(10)  # pause between movies so Douban is not hammered
        # print(cleaned_comments)
        # Cut the text into words and drop stop words
        segment = jieba.lcut(cleaned_comments)
        word_count = pd.DataFrame({'segment': segment})
        stop_words = pd.read_csv("file/stopwords.txt", index_col=False, quoting=3, sep="\t",
                                 names=['stopword'], encoding='utf-8')
        word_count = word_count[~word_count.segment.isin(stop_words.stopword)]
        # Count word frequencies
        words_stat = word_count.groupby('segment').agg(size=('segment', 'size'))
        words_stat = words_stat.reset_index().sort_values(by="size", ascending=False)
        word_count = word_count.groupby(['segment']).size()
        # print(word_count.sort_values(ascending=False))
        # Render a word cloud (font path, background color and max font size)
        wordcloud = WordCloud(font_path="file/simhei.ttf", background_color="white", max_font_size=80)
        # fit_words expects a dict of word -> frequency in recent wordcloud versions
        word_frequence = {x[0]: x[1] for x in words_stat.head(1000).values}
        wordcloud = wordcloud.fit_words(word_frequence)
        plt.imsave(l['id'] + 'img.jpg', wordcloud)
        plt.imshow(wordcloud, interpolation='bilinear')
        plt.show()  # display the word cloud; blocks until the window is closed
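
# Assuming file/stopwords.txt and file/simhei.ttf exist next to the script, each run should
# leave two artifacts per now-playing movie: "<data-title>.txt" with the cleaned comment text
# and "<id>img.jpg" with the rendered word cloud.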