-
Notifications
You must be signed in to change notification settings - Fork 0
/
get_links.py
95 lines (79 loc) · 3.04 KB
/
get_links.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
import logging
import sys
from selenium import webdriver
import time
import argparse
def get_links_from_page(my_webpage):
    """
    Collect competition links from the currently loaded search-results page.

    Looks up anchors a[1]..a[19] under the results container by absolute
    XPath and gathers their ``href`` attributes; indexes with no matching
    anchor are logged and skipped.

    :param my_webpage: webdriver (or any object exposing
        ``find_element_by_xpath``) with the results page already loaded
    :return: list of href strings found on this page
    """
    # Local logger: the module-level ``logger`` is only created under
    # __main__, so relying on it raises NameError when imported as a module.
    logger = logging.getLogger(__name__)
    competition_links = []
    # A results page shows at most 19 competition anchors (a[1]..a[19]).
    for i in range(1, 20):
        try:
            compet = my_webpage.find_element_by_xpath(
                '//*[@id="root"]/div/div[1]/div[2]/div/div/div[2]/div[2]/div[2]/a[' + str(i) + ']')
        except Exception:
            # Narrowed from a bare ``except:`` so KeyboardInterrupt/SystemExit
            # still propagate; a missing anchor just means fewer results.
            logger.exception("Can't get link id= %s", i)
            continue
        competition_links.append(compet.get_attribute("href"))
    logger.debug('Collected competition links from one page')
    return competition_links
def connect(url="https://www.kaggle.com/search?q=in%3Acompetitions", wait_seconds=5):
    """
    Create a headless Chrome driver and open a Kaggle page.

    :param url: page to load after the driver starts (defaults to the
        Kaggle competition search, preserving the original behavior)
    :param wait_seconds: seconds to sleep so client-side content can render
        -- NOTE(review): an explicit WebDriverWait would be more robust
    :return: chrome driver with the page loaded
    """
    options = webdriver.ChromeOptions()
    options.add_argument('headless')
    # ``chrome_options``/``executable_path`` match the Selenium 3 API the
    # rest of this file uses (find_element_by_xpath); driver binary is
    # expected next to the script.
    driver = webdriver.Chrome(chrome_options=options, executable_path='./chromedriver')
    driver.get(url)
    # Kaggle renders the results client-side; give the page time to populate.
    time.sleep(wait_seconds)
    return driver
def get_links_from_site(driver, num_pages=450):
    """
    Walk the paginated Kaggle search results and gather competition links.

    For each page: scrape its links, click the "next page" pagination
    button, then wait for the new results to render.

    :param driver: chrome driver with the search page already loaded
    :param num_pages: number of result pages to scrape
    :return: list of competition link strings
    """
    # Local logger: the module-level ``logger`` only exists when run as a
    # script; this keeps the function usable from an import.
    logger = logging.getLogger(__name__)
    competition_links = []
    for page_idx in range(num_pages):
        competition_links += get_links_from_page(driver)
        logger.info('Collected `links` from page %s', page_idx + 1)
        # Absolute XPath of the "next page" pagination button.
        driver.find_element_by_xpath(
            '//*[@id="root"]/div/div[1]/div[2]/div/div/div[2]/div[2]/div[3]/div/button[2]').click()
        # Fixed sleep while the next page renders client-side.
        time.sleep(5)
    logger.info('Collected all competition links from pages')
    return competition_links
def extract_links_to_file(file_name):
    """
    Scrape all competition links from Kaggle and write them to a text file.

    :param file_name: output filename (one link per line)
    :return: None
    """
    # Local logger: the module-level ``logger`` only exists when run as a
    # script; this keeps the function usable from an import.
    logger = logging.getLogger(__name__)
    kaggle_driver = connect()
    competition_links = get_links_from_site(kaggle_driver)
    # Context manager guarantees the file is closed even if a write raises
    # (the original leaked the handle on error).
    with open(file_name, 'w') as output_comp_links:
        for link in competition_links:
            output_comp_links.write(link + '\n')
    logger.info('Save link to file finished')
if __name__ == '__main__':
    # CLI: only option is where to store the scraped links.
    # (Fixed typo "Exctract" -> "Extract" in the user-facing description.)
    parser = argparse.ArgumentParser(description='Extract links for competitions from kaggle.com')
    parser.add_argument('--links_file', type=str, help='Where store the scrapped links of competitions', action="store",
                        default='kaggle_links.txt')
    args = parser.parse_args()
    # Logging setup: full DEBUG detail goes to get_links.log with timestamps;
    # a plain StreamHandler mirrors records to stdout.
    logger = logging.getLogger(__name__)
    logger.setLevel(logging.DEBUG)
    file_handler = logging.FileHandler('get_links.log')
    file_handler.setLevel(logging.DEBUG)
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    file_handler.setFormatter(formatter)
    logger.addHandler(file_handler)
    logger.addHandler(logging.StreamHandler(sys.stdout))
    logger.info("Start to collect competition links from Kaggle.com")
    extract_links_to_file(args.links_file)
    logger.info("Main in get_links.py is finished")