-
Notifications
You must be signed in to change notification settings - Fork 2
/
AlchemyData_Scraper.py
50 lines (43 loc) · 1.71 KB
/
AlchemyData_Scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
import re
import xml.etree.cElementTree as ET
from lxml import html
import requests
from urlparse import urlparse
# AlchemyAPI access key. No code visible in this file reads this constant.
# NOTE(review): this is a credential committed to source control -- consider
# rotating it and loading it from the environment instead.
API_KEY = '16f12887c8dab2166c6bec9f62ccc3030342a016'
def main(xml_path=r'C:\Users\eagle\Documents\GitHub\DragonSource\AlchemyXMLResult(1).txt'):
    """Collect article URLs from a saved AlchemyAPI XML result and scrape each.

    The XML file at *xml_path* is parsed and every child of ``root[3][0]``
    is treated as one result entry. The URL is read from
    ``entry[1][0][0][0].text`` with its first and last character removed
    (presumably surrounding quotes -- confirm against the saved file).
    Each URL is printed, then the full list is printed, then every URL is
    handed to ``get_content``.

    Args:
        xml_path: Path of the saved XML result. Defaults to the location
            the script was originally written against; the raw string
            keeps the Windows backslashes literal on every Python version.
    """
    # The original query URL. As a free user of the API, I was not allowed to dynamically call the API more than the set limit
    # url = 'https://access.alchemyapi.com/calls/data/GetNews?apikey=16f12887c8dab2166c6bec9f62ccc3030342a016&return=enriched.url.url&start=1487462400&end=1488150000&q.enriched.url.enrichedTitle.taxonomy.taxonomy_.label=law,%20govt%20and%20politics&count=100&outputMode=json'
    root = ET.parse(xml_path).getroot()

    try:
        entries = list(root[3][0])
    except IndexError:
        # The document does not have the expected nesting at all.
        print('Index Error: no result entries found in %s' % xml_path)
        entries = []

    url_list = []
    # Walk the entries that actually exist instead of a fixed count, so a
    # shorter result file no longer produces one error line per missing index.
    for position, entry in enumerate(entries):
        try:
            raw_text = entry[1][0][0][0].text
        except IndexError:
            print('Index Error at Element %d' % position)
            continue
        if raw_text is None:
            # An empty element has no text to slice; skip it instead of crashing.
            print('No URL text at Element %d' % position)
            continue
        url = raw_text[1:-1]
        print(url)
        url_list.append(url)

    print(url_list)
    for url in url_list:
        get_content(url)
def get_content(link):
    """Download *link*, keep its long paragraphs and save them with file_output.

    The page is fetched with ``requests``, parsed with ``lxml.html``, and the
    text nodes directly under ``<p>`` elements that are longer than 100
    characters are kept with non-ASCII characters dropped. The output name is
    the third item of ``re.findall(r'\\w*', netloc)``: for a host such as
    ``www.example.com`` that is ``example``; for ``example.com`` it is ``com``.

    Processing is best-effort: a failure for one URL is reported on stdout
    and the function returns, so the caller can carry on with the next URL.

    Args:
        link: Absolute URL of the page to scrape.
    """
    try:
        # A timeout keeps one unresponsive host from stalling the whole run.
        page = requests.get(link, timeout=30)
    except requests.RequestException as exc:
        print('%s could not be fetched: %s' % (link, exc))
        return
    print('%s seen' % link)

    try:
        tree = html.fromstring(page.content)
        paragraphs = tree.xpath('//p/text()')
        # encode(..., 'ignore') drops non-ASCII characters; decoding back
        # yields text that can be written on both Python 2 and Python 3.
        main_story = [
            text.encode('ascii', 'ignore').decode('ascii')
            for text in paragraphs
            if len(text) > 100
        ]
        # Raises IndexError when the host yields fewer than three matches.
        name = re.findall(r'\w*', urlparse(link).netloc)[2]
        file_output(name, main_story)
    except Exception as exc:
        # Per-URL boundary: report the problem instead of hiding it, but do
        # not let one bad page abort the remaining URLs.
        print('%s could not be processed: %s' % (link, exc))
        return
    print('%s has been processed' % link)
def file_output(source, content_list):
    """Write *source* and the collected text to the file ``<source>.txt``.

    The file is created (or truncated) relative to the current working
    directory unless *source* already contains a path. Its first line is
    *source* itself; the items of *content_list* follow, written back to
    back with no separator added between them.

    Args:
        source: Name used both for the file stem and for the header line.
        content_list: Iterable of strings to write after the header.
    """
    # The original body referred to an undefined ``name``; the parameter
    # that carries the value is ``source``.
    file_name = '%s.txt' % source
    with open(file_name, 'w') as out:
        out.write(source)
        out.write('\n')
        out.writelines(content_list)
# Script entry point: the scrape starts as soon as this module is executed.
# There is no __main__ guard, so importing the module triggers the run too.
main()