-
Notifications
You must be signed in to change notification settings - Fork 2
/
dblp2graph.py
executable file
·152 lines (136 loc) · 4.54 KB
/
dblp2graph.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
#! /usr/bin/python2.7
#
# This script extracts a graph from the DBLP XML
# Nodes - authors
# Edges - between any two co-authors
#
# dblp2graph.py
# [-i <input-dir>]
# [-o <output-dir>]
# [-s <start-date>]
# [-e <end-date>]
# [--core <k>]
#
# -i <input-dir> Directory for dblp xml files (default: ./xml)
# -o <output-dir> Directory for nodes and edges files (default: ./)
# -s <start-date> Only output edges & nodes from start-date (default: None)
# -e <end-date> Only output edges & nodes until end-date (default: None)
# --core <k> Only output nodes with <k> papers (default: 1)
#
# example:
# dblp2graph.py -o ./graph -s 2000-01-01 -e 2005-01-01 --core 3
# NOTE: This is a memory intensive script, approximately O(n)
# Make sure you can fit the XML in memory
import argparse
import datetime
import itertools
# lxml can load elements from DTD file
from lxml import etree
import os
# Set to True to get progress messages from debug().
DEBUG=False
# Expected input file names inside the -i directory.
INPUT_XML = "dblp.xml"
INPUT_DTD = "dblp.dtd"
# Output file names written into the -o directory.
OUTPUT_NODES = "nodes.csv"
OUTPUT_EDGES = "edges.csv"
# Format shared by the XML 'mdate' attribute and the -s/-e CLI options.
ISO_DATE_FORMAT = '%Y-%m-%d'
# DBLP record tags that count as papers.
PAPER_TYPES = set(["article","inproceedings","proceedings","book",
"incollection","phdthesis","mastersthesis","www"])
# Attribute holding the record's modification date.
PAPER_DATE_ATT = 'mdate'
# Child element tag naming one author of a paper.
PAPER_AUTHOR_ELEM = 'author'
# data:
# dict of author-name -> (numeric uid, publication-count)
nodes = {}
# set of directed co-author pairs (uid, uid); both (x,y) and (y,x) are stored
edges = set()
# This will enable O(n) memory footprint,
# and allow to filter out the core.
def debug(msg=None):
    """Print *msg* to stdout when the module-level DEBUG flag is set.

    msg -- any printable object (default None).
    """
    if DEBUG:
        # Format into a single string first: under the declared Python 2.7
        # interpreter, print("DEBUG: ", msg) would print a tuple repr
        # ('DEBUG: ', msg) instead of the message.
        print("DEBUG: %s" % (msg,))
# Checks if this is a valid paper
def isValidPaper(e, startDate=None, endDate=None):
    """Return True when element *e* is a paper record inside the date window.

    e         -- an lxml Element parsed from the DBLP XML
    startDate -- datetime lower bound (inclusive), or None for no bound
    endDate   -- datetime upper bound (inclusive), or None for no bound

    Side effect: rejected elements that carried a date are clear()ed
    immediately to release their memory.

    BUGFIX: the original body read the *global* `elem` (set by the caller's
    loop) instead of the parameter `e`, so the function only worked by
    accident when called on the current loop element.
    """
    if not e.tag in PAPER_TYPES:
        return False
    # check for mdate attribute; records without it cannot be date-filtered
    datestr = e.get(PAPER_DATE_ATT)
    if not datestr:
        e.clear()
        return False
    mdate = datetime.datetime.strptime(datestr, ISO_DATE_FORMAT)
    # validate startDate/endDate: reject anything outside the window
    if (startDate and startDate > mdate) or (endDate and endDate < mdate):
        e.clear()
        return False
    return True
# ---- command-line handling --------------------------------------------------
parser = argparse.ArgumentParser(
    description='Extract a graph from the DBLP XML')
parser.add_argument('--core', dest='core', default=1, type=int,
                    help='Only output nodes with CORE papers')
parser.add_argument('-i', dest='inputDir', default='./xml',
                    help='Directory for dblp xml files')
parser.add_argument('-o', dest='outputDir', default='./',
                    help='Directory for nodes and edges files')
parser.add_argument('-s', dest='startDate', default=None,
                    help='Only output edges & nodes from STARTDATE')
parser.add_argument('-e', dest='endDate', default=None,
                    help='Only output edges & nodes until ENDDATE')
args = parser.parse_args()
# Promote the date option strings (if given) to datetime objects in place.
for dateAttr in ('startDate', 'endDate'):
    rawValue = getattr(args, dateAttr)
    if rawValue:
        setattr(args, dateAttr,
                datetime.datetime.strptime(rawValue, ISO_DATE_FORMAT))
# Refuse to run unless both dblp.xml and dblp.dtd are present.
for requiredFile in (os.path.join(args.inputDir, INPUT_XML),
                     os.path.join(args.inputDir, INPUT_DTD)):
    if not os.path.isfile(requiredFile):
        print("error: unable to find %s" % requiredFile)
        parser.print_help()
        exit(1)
# Stream-parse the XML; iterparse yields each element as it closes so we
# can process and free records one at a time.
context = etree.iterparse(
    os.path.join(args.inputDir, INPUT_XML),
    load_dtd=True)
debug('reading XML into data structure')
uid = 0
for action, elem in context:
    if isValidPaper(elem, args.startDate, args.endDate):
        # Iterating the element directly replaces the deprecated
        # getchildren() call (same children, same order).
        authors = [e for e in elem if e.tag == PAPER_AUTHOR_ELEM]
        if len(authors) > 1:
            # extract nodes
            authors_ids = set()
            #TODO: using python hash might expose the edges to corruption, use incremental index
            for author in authors:
                # Some <author> elements may have no text node; skip them
                # instead of crashing on None.replace().
                if not author.text:
                    continue
                # remove commas and other non-supported characters for
                # a valid ASCII CSV output
                name = author.text.replace(',', '').encode('utf-8').decode('ascii', 'ignore')
                # 'in' works on Python 2 and 3; has_key() is Python-2-only.
                if name not in nodes:
                    nodes[name] = (uid, 0)
                    uid += 1
                nodes[name] = (nodes[name][0], nodes[name][1] + 1)
                authors_ids.add(nodes[name][0])
            # extract edges: store both orientations so the edge list is
            # symmetric for undirected-graph consumers
            for pair in itertools.combinations(authors_ids, 2):
                edges.add(tuple(sorted(pair)))                # add (x,y)
                edges.add(tuple(sorted(pair, reverse=True)))  # add (y,x)
    # Free the RAM
    elem.clear()
# ---- write data -------------------------------------------------------------
if not os.path.exists(args.outputDir):
    os.makedirs(args.outputDir)
# write nodes: one "uid,name" row per author with at least --core papers.
# 'with' guarantees the handles are closed/flushed even on exception
# (the originals were never closed on an error path).
debug('writing graph nodes')
nodesByID = {}
with open(os.path.join(args.outputDir, OUTPUT_NODES), 'w') as nodesFile:
    for name in nodes:
        nodeId, paperCount = nodes[name]
        if paperCount >= args.core:
            nodesFile.write('%s,%s\n' % (nodeId, name))
            # remember survivors so edges can be filtered to the core
            nodesByID[nodeId] = name
# write edges: keep only edges whose both endpoints passed the core filter
debug('writing graph edges')
with open(os.path.join(args.outputDir, OUTPUT_EDGES), 'w') as edgesFile:
    for edge in edges:
        if (edge[0] in nodesByID) and (edge[1] in nodesByID):
            edgesFile.write('%s,%s\n' % edge)