-
Notifications
You must be signed in to change notification settings - Fork 2
/
YouminAnalysi.py
157 lines (131 loc) · 6.52 KB
/
YouminAnalysi.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
from bs4 import BeautifulSoup
from urllib.request import urlopen
import pyhdb
import datetime
import requests
import re
import datetime
def get_connect(): # 连接数据库信息
conn_obj = pyhdb.connect(
host='10.150****',
port = 30015,
user = 'sa***p2',
password = 'S**0'
)
return conn_obj
def intohana_grandcat(conn,id,cate,site):
cursor = conn.cursor()
cursor.execute('insert into ZANG_TEST_GRANDCAT (GRANDCAT_ID,GRANDCAT_NAME,SITE,UPDATE_DATE) values(%s,%s,%s,%s)',(id,cate,site,current_date))
def intohana_game(conn,grand_id,grand_name,game_id,game_name,site):
cursor = conn.cursor()
cursor.execute('insert into ZANG_TEST_GAM (GRAND_ID,GRAND_NAME,GAME_ID,GAME_NAME,GAME_SITE,UPDATE_DATE) values(%s,%s,%s,%s,%s,%s) ',(grand_id,grand_name,game_id,game_name,site,current_date))
updateSQL="insert into SAPABAP2.ZANG_TEST_GAM_NEW (select a.GRAND_ID,a.GRAND_NAME,a.GAME_ID,a.GAME_NAME,a.GAME_SITE,a.UPDATE_DATE from SAPABAP2.ZANG_TEST_GAM a where a.GAME_ID not in (select GAME_ID from SAPABAP2.ZANG_TEST_GAM_NEW)) "
cursor.execute(updateSQL)
def intohana_blogdetail(conn,GRAND_ID,GAME_ID,GAME_NAME,BLOG_TYPE,BLOG_THEME,REPLY_NUM,
READ_NUM,CREATOR,CREATE_TIME,LAST_REPLIER,LAST_REPLY_TIME,UPDATE_TIME):
cursor = conn.cursor()
cursor.execute('insert into ZANG_TEST_BLOG_DETAIL (GRAND_ID,GAME_ID,GAME_NAME,BLOG_TYPE,BLOG_THEME,REPLY_NUM,READ_NUM,CREATOR,CREATE_TIME,LAST_REPLIER,LAST_REPLY_TIME,UPDATE_TIME) '
'values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s) ',
(GRAND_ID,GAME_ID,GAME_NAME,BLOG_TYPE,BLOG_THEME,REPLY_NUM,
READ_NUM,CREATOR,CREATE_TIME,LAST_REPLIER,LAST_REPLY_TIME,UPDATE_TIME))
updateSQL="insert into SAPABAP2.ZANG_TEST_BLOG_DETAIL_NEW (select GRAND_ID,GAME_ID,GAME_NAME,BLOG_TYPE,BLOG_THEME,REPLY_NUM,READ_NUM,CREATOR,CREATE_TIME,LAST_REPLIER,LAST_REPLY_TIME,UPDATE_TIME from SAPABAP2.ZANG_TEST_BLOG_DETAIL where BLOG_THEME not in (select BLOG_THEME from SAPABAP2.ZANG_TEST_BLOG_DETAIL_NEW)) "
cursor.execute(updateSQL)
def delete_grandcat(conn):
cursor = conn.cursor()
cursor.execute('truncate table ZANG_TEST_GRANDCAT')
cursor.execute('truncate table ZANG_TEST_GAM')
cursor.execute('truncate table ZANG_TEST_BLOG_DETAIL')
def check_contain_chinese(check_str):
for ch in check_str:
if u'\u4e00' <= ch <= u'\u9fff':
return True
return False
def getgrand(url):
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36(KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'} # 设置头文件信息 #
response = requests.get(url, headers=headers).content # 提交requests get 请求
soup = BeautifulSoup(response, "html.parser") # 用Beautifulsoup 进行解析
commid = soup.findAll('a', class_='xi2')
for commid2 in commid[2:-1]: ##爬取论坛首页
href=commid2.get("href")
if len(href.split("-"))>=2 :
id=href.split("-")[1]
site=url+href
print(site)
cate=commid2.text
if check_contain_chinese(cate) == True:
response2 = requests.get(site, headers=headers).content # 提交requests get 请求
soup2= BeautifulSoup(response2, "html.parser") # 用Beautifulsoup 进行解析
catmid=soup2.findAll('dt')
intohana_grandcat(conn, id, cate, site)
# print(catmid)
# print(cate)
for catmid2 in catmid: ##爬取每个游戏类型首页
a=catmid2.findAll("a")[0]
href2=a.get("href")
gamename=a.text
# print(gamename)
if (len(href2.split("-"))>=2) & (href2[-4:]=="html") :
site2=url+href2
id2=href2.split("-")[1]
# print(a)
# print(id2)
# print(site2)
intohana_game(conn, id, cate, id2, gamename, site2)
getdetail(site2,conn,id,id2,gamename) ## get detail of blog information
def getdetail(site,conn,GRAND_ID,GAME_ID,GAME_NAME): ## get detail of blog information
globals()
type=''
theme=''
replynum=''
readnum=''
editor=''
createdate=''
lastreply=''
lastreplydate=''
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36(KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'} # 设置头文件信息 #
response3 = requests.get(site, headers=headers).content # 提交requests get 请求
soup3 = BeautifulSoup(response3, "html.parser") # 用Beautifulsoup 进行解析
gamemid = soup3.findAll('th', class_='new')
td = soup3.findAll('td', class_='by')
for tbody in soup3.findAll('tbody'):
for tr in tbody.findAll('tr'):
for new in tr.findAll('th', class_='new'): ##get title
em = new.find('em')
if em:
type = em.text[1:-1]
else:
type = ''
theme = new.find('a', class_='s xst').text
num = tr.find('td', class_='num') ###get reply & read
if num:
replynum = num.find('a').text
readnum = num.find('em').text
else:
replynum='0'
readnum='0'
# print(type, theme, replynum, readnum)
by = tr.findAll('td', class_='by') # get editor & date
if by:
for uby in by[:1]: # get editor & date
createdate = uby.find('em').text
editor = uby.find('cite').text
# print(editor,createdate)
for uby in by[1:]: # get editor & date
lastreply = uby.find('cite').text
lastreplydate = uby.find('em').text
# print(editor, createdate, lastreply, lastreplydate)
# print(lastreply, lastreplydate)
intohana_blogdetail(conn,GRAND_ID,GAME_ID,GAME_NAME,type,theme,replynum,
readnum,editor,createdate,lastreply,lastreplydate,current_daytime)
if __name__ == '__main__':
globals()
conn = get_connect()
current_date = datetime.datetime.now()
current_daytime = current_date.strftime('%Y-%m-%d %H:%M:%S')
url = 'http://bbs.3dmgame.com/'
delete_grandcat(conn)
getgrand(url)
conn.commit()
print('Update complted')