-
Notifications
You must be signed in to change notification settings - Fork 2
/
import.rb
68 lines (61 loc) · 1.75 KB
/
import.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
#!/usr/bin/env ruby
require 'rubygems'
require './extractcontent'
require './tokenize'
require 'open-uri'
# Fetches the page at the URL given as the first command-line argument,
# extracts its main text with ExtractContent, tokenizes it with TextTokenizer
# and prints the resulting document to stdout as JSON.
#
# Example: ruby import.rb http://www.ruby-lang.org/en/
require 'json'

url = ARGV[0]
abort "usage: #{$0} URL" if url.nil? || url.empty?

# Fetch through URI#read (provided by open-uri) rather than Kernel#open:
# Kernel#open no longer accepts URLs on Ruby 3.0+, and it would spawn a
# subprocess for an argument that starts with "|". URI#read also closes the
# underlying IO once the body has been read.
begin
  page = URI.parse(url)
rescue URI::InvalidURIError => e
  abort "invalid URL #{url.inspect}: #{e.message}"
end
abort "unsupported URL #{url.inspect} (expected an http(s) URL)" unless page.respond_to?(:read)

# Normalise to UTF-8, replacing bytes that cannot be converted with a space.
html = page.read.encode("UTF-8", :invalid => :replace, :undef => :replace, :replace => ' ')

original_text, title = ExtractContent::analyse(html)
# Extracted text is split into lines on newline runs. When nothing was
# extracted, the raw HTML string is passed on unsplit.
# NOTE(review): that fallback hands the tokenizer a String instead of an
# Array - confirm TextTokenizer#tokenize accepts both.
original_text = original_text ? original_text.split(/\r*\n+/) : html

doc = TextTokenizer.new.tokenize(original_text, title)
doc[:url] = url
puts JSON.generate(doc)
# NOTE(review): everything between =begin and =end below is disabled code and
# is never executed. It looks like an earlier persistence path that stored the
# document, its sentences, its tokens and per-word counters in database
# collections; the BSON::ObjectId / find_one / '$inc' / '$push' usage suggests
# the legacy MongoDB Ruby driver - confirm before reviving.
# It relies on names that are not set up anywhere in the visible script
# (stop_words, @docs_coll, @words_coll, @tokens_coll, @word_counters_coll,
# @sentences_coll), so it cannot be re-enabled as-is.
=begin
doc_id = @docs_coll.insert({:url => url, :title => title})
doc[:sentences].each do |s|
sentence_id = s[:_id] = BSON::ObjectId.new
word_keys = []
s[:tokens].each do |t|
word_key = [t[:part_of_speech][0],t[:base_form]].join(':')
unless stop_words.include?(t[:base_form]) || word_keys.include?(word_key)
exists_word = @words_coll.find_one(:word_key => word_key)
word_id = if exists_word
exists_word['_id']
else
@words_coll.insert({:word_key => word_key})
end
token_id = @tokens_coll.insert({
:word => word_id,
:base_form => t[:base_form],
:part_of_speech => t[:part_of_speech],
:sentence => sentence_id,
:sentence_token_offset => t[:token_begin],
:sentence_token_length => t[:token_length]
})
@word_counters_coll.insert({
:word => word_id,
:base_form => t[:base_form],
:part_of_speech => t[:part_of_speech],
:tokens => [],
:counter => 0
})
@word_counters_coll.update({
:word => word_id
},
{
'$inc' => {:counter => 1},
'$push' => {:tokens => token_id}
})
word_keys << word_key
end
end
@sentences_coll.insert s
end
=end