-
Notifications
You must be signed in to change notification settings - Fork 0
/
EdmundsCrawler.java
127 lines (102 loc) · 3.04 KB
/
EdmundsCrawler.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.List;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
 * Crawler for Edmunds.com.
 *
 * <p>Workflow, as driven by {@link #main(String[])}:
 * <ol>
 *   <li>{@link #listFilesForFolder(File)} collects the names of the URL list files found under
 *       {@link #urlFolder};</li>
 *   <li>each URL list file is read line by line, one URL per line;</li>
 *   <li>each listed page is fetched, and every anchor on it whose {@code class} attribute is
 *       {@code header-3} is followed;</li>
 *   <li>the followed page is scraped into a {@code Review}, which is printed to standard output
 *       and appended to {@code outputFolder + <urlFileName> + ".txt"}.</li>
 * </ol>
 *
 * @author namangupta
 */
public class EdmundsCrawler {

    /** Connection/read timeout passed to jsoup for every request, in milliseconds. */
    private static final int TIMEOUT_MILLIS = 15 * 1000;

    /** Pause before each listing-page request, in milliseconds, to avoid being flagged as a bot. */
    private static final long REQUEST_DELAY_MILLIS = 5000;

    // Scratch fields: overwritten by getData() for every review page it scrapes.
    String title;
    String vehicle;
    String date;
    String reviewText;
    Review review = null;

    /** Names (without directory part) of the URL list files to crawl. */
    List<String> fileNames;

    /** Folder containing the URL list files. */
    final static String urlFolder = "data/urls";

    /** Folder prefix (with trailing slash) under which review files are written. */
    final static String outputFolder = "data/Reviews/";

    /** Creates a crawler with an empty list of URL files. */
    public EdmundsCrawler() {
        fileNames = new ArrayList<String>();
    }

    /**
     * Follows every review link on a listing page and writes the scraped review.
     *
     * <p>A review link is an anchor whose {@code class} attribute equals {@code header-3}
     * (case-insensitively). Each scraped review is printed to standard output and to
     * {@code pw}, which is flushed after every review.
     *
     * @param doc the already-fetched listing page
     * @param pw  the writer receiving one line per review
     * @throws Exception if fetching a review page fails; the remaining links of {@code doc}
     *                   are then not processed
     */
    private void getData(Document doc, PrintWriter pw) throws Exception {
        Elements links = doc.select("a");
        for (Element link : links) {
            String attribute = link.attr("class");
            if (!attribute.equalsIgnoreCase("header-3")) {
                continue;
            }

            // "abs:href" yields an empty string when no absolute URL can be built; connecting
            // to it would throw and abort the rest of this page, so skip such anchors.
            String reviewUrl = link.attr("abs:href");
            if (reviewUrl.length() == 0) {
                continue;
            }

            Document reviewPage = Jsoup.connect(reviewUrl).timeout(TIMEOUT_MILLIS).get();
            this.title = reviewPage
                    .getElementsByAttributeValue("class", "header-3")
                    .text().trim();
            this.vehicle = reviewPage
                    .getElementsByAttributeValue("itemprop", "itemreviewed")
                    .text().trim();
            this.reviewText = reviewPage
                    .getElementsByAttributeValue("itemprop", "description")
                    .text().trim();
            this.date = reviewPage
                    .getElementsByAttributeValue("itemprop", "dtreviewed")
                    .text().trim();

            // Only the first space-separated token of the date text is kept.
            review = new Review(title, vehicle, reviewText, date.split(" ")[0]);
            System.out.println(review);
            pw.println(review);
            pw.flush();
        }
    }

    /**
     * Crawls every URL list file recorded in {@link #fileNames}.
     *
     * <p>Files whose name ends with {@code ~} are treated as editor temporaries and skipped.
     *
     * @throws Exception if a URL list file cannot be read, an output file cannot be created,
     *                   or the thread is interrupted while throttling
     */
    private void crawlUrlList() throws Exception {
        for (String urlFile : fileNames) {
            // Skipping temporary files
            if (urlFile.endsWith("~")) {
                continue;
            }
            crawlUrlFile(urlFile);
        }
    }

    /**
     * Crawls all URLs listed in one URL file and writes the reviews to the matching output file.
     * Both the reader and the writer are closed even when crawling is aborted by an exception.
     */
    private void crawlUrlFile(String urlFile) throws Exception {
        // PrintWriter does not create missing parent directories.
        File outputDir = new File(outputFolder);
        if (!outputDir.isDirectory() && !outputDir.mkdirs()) {
            throw new IllegalStateException("Cannot create output folder: " + outputFolder);
        }

        BufferedReader reader = new BufferedReader(new FileReader(new File(urlFolder, urlFile)));
        try {
            PrintWriter writer = new PrintWriter(outputFolder + urlFile + ".txt");
            try {
                String url;
                while ((url = reader.readLine()) != null) {
                    url = url.trim();
                    if (url.length() == 0) {
                        continue;
                    }

                    // Continuous hits may be detected as a bot, so hold for a while before
                    // each request. Blank lines trigger no request and therefore no delay.
                    Thread.sleep(REQUEST_DELAY_MILLIS);

                    try {
                        Document doc = Jsoup.connect(url).timeout(TIMEOUT_MILLIS).get();
                        getData(doc, writer);
                    } catch (Exception e) {
                        // Best effort: one failing URL must not stop the remaining ones.
                        System.err.println("Failed to crawl " + url);
                        e.printStackTrace();
                    }
                }
            } finally {
                writer.close();
            }
        } finally {
            reader.close();
        }
    }

    /**
     * Recursively records the name of every regular file found under {@code folder}.
     *
     * <p>NOTE: only the bare file name is stored, while the crawl step opens each name directly
     * under {@link #urlFolder}; a file discovered in a subdirectory will therefore not be found
     * when it is read.
     *
     * @param folder the directory to scan; if it does not exist, is not a directory, or cannot
     *               be read, a warning is printed and nothing is recorded
     */
    public void listFilesForFolder(File folder) {
        File[] entries = folder.listFiles();
        if (entries == null) {
            // listFiles() returns null rather than an empty array for a non-directory
            // or on an I/O error.
            System.err.println("Cannot list files in " + folder);
            return;
        }
        for (final File fileEntry : entries) {
            if (fileEntry.isDirectory()) {
                listFilesForFolder(fileEntry);
            } else {
                fileNames.add(fileEntry.getName());
            }
        }
    }

    /**
     * Entry point: crawls every URL list file found under {@link #urlFolder}.
     *
     * @param args unused
     * @throws Exception if crawling is aborted, see the crawl step for the causes
     */
    public static void main(String args[]) throws Exception {
        /*
         * To crawl through an HTTP proxy, set the proxy properties first:
         * System.setProperty("http.proxyHost", "****");
         * System.setProperty("http.proxyPort", "****");
         */
        EdmundsCrawler crawler = new EdmundsCrawler();
        crawler.listFilesForFolder(new File(urlFolder));
        crawler.crawlUrlList();
    }
}