# ib.py

import csv
import random
import string
import numpy as np
# Build the per-person score index (name -> weighted mean movie score and total weight).
def indexbuilder(weight,movie_file,index_file):
	"""Build a per-person score index from a movie CSV file.

	Every director, writer and actor credited on a movie contributes that
	movie's ``score`` to a running weighted mean kept for that person.  The
	weight of a contribution depends on the role: ``weight[0]`` for
	directors, ``weight[1]`` for writers, ``weight[2]`` for the first two
	listed actors, ``weight[3]`` for the third and fourth, and ``weight[4]``
	for all remaining actors.

	Args:
		weight: sequence of the five numeric role weights described above.
		movie_file: path of a CSV file with ``score``, ``directors``,
			``writers`` and ``actors`` columns.  The three people columns are
			parsed as bracketed, quoted lists such as ``['A', 'B']``.
		index_file: not used by this function; kept so existing callers
			keep working.

	Returns:
		dict mapping each person's name to
		``[weighted_mean_score, total_weight]``.
	"""
	import sys

	# The csv module needs a binary file on Python 2 and a text file opened
	# with newline='' on Python 3.
	if sys.version_info[0] < 3:
		f = open(movie_file, 'rb')
	else:
		f = open(movie_file, 'r', newline='')

	index = {}
	# 'with' closes the file even when a malformed row raises.
	with f:
		for line in csv.DictReader(f, delimiter=',', quotechar='"'):
			mark = float(line['score'])

			# Collect (name, role weight) for everybody credited on this movie.
			# Each field looks like "['A', 'B']": drop the brackets, split on
			# ", ", then strip the quotes around every name.
			# NOTE(review): an empty list field ("[]") yields the empty name ''
			# which is indexed like any other person -- confirm this is wanted.
			credits = []
			for raw in line['directors'][1:-1].split(", "):
				credits.append((raw.strip()[1:-1], weight[0]))
			# A writer listed several times on one movie counts only once.
			for raw in set(line['writers'][1:-1].split(", ")):
				credits.append((raw.strip()[1:-1], weight[1]))
			for pos, raw in enumerate(line['actors'][1:-1].split(", ")):
				if pos < 2:
					role_weight = weight[2]
				elif pos < 4:
					role_weight = weight[3]
				else:
					role_weight = weight[4]
				credits.append((raw.strip()[1:-1], role_weight))

			for name, role_weight in credits:
				# Placeholder entries are not real people.
				if 'more credit' in name or 'unknown' in name:
					continue
				if name not in index:
					index[name] = [mark, role_weight]
					continue
				old_mark, old_weight = index[name]
				total = old_weight + role_weight
				if total:
					new_mark = (old_mark*old_weight + role_weight*mark)/total
				else:
					# All contributions so far carry zero weight: the mean is
					# undefined, so keep the stored mark instead of dividing by 0.
					new_mark = old_mark
				index[name] = [new_mark, total]
	return index


# Transform each movie's features from people's names into the mean score of each kind of person
# (directors, writers, main actors, secondary actors, remaining actors).
def vectorbuilder(test_file,ppl,unknown_mean=5.9,unknown_std=0.764323):
	"""Turn every movie in a CSV file into a numeric feature vector.

	Each credited person is replaced by the score stored for them in *ppl*;
	people missing from *ppl* get a value drawn from a normal distribution.
	Entries whose name contains ``more credit`` are ignored.

	Args:
		test_file: path of a CSV file with ``imdb_id``, ``score``,
			``directors``, ``writers`` and ``actors`` columns.  The three
			people columns are parsed as bracketed, quoted lists such as
			``['A', 'B']``.
		ppl: mapping from a person's name to a sequence whose first item is
			that person's score (a number or a numeric string).
		unknown_mean: mean of the normal distribution used for people that
			are not in *ppl*.
		unknown_std: standard deviation of that distribution.

	Returns:
		dict mapping ``imdb_id`` to ``[director_mean, writer_mean,
		actor_1_2_mean, actor_3_4_mean, other_actors_mean, score]``.  The
		director and writer means come from ``np.mean`` (nan when the group
		is empty); an actor group with no non-zero score is reported as 0.
	"""
	import sys

	# The csv module needs a binary file on Python 2 and a text file opened
	# with newline='' on Python 3.
	if sys.version_info[0] < 3:
		f = open(test_file, 'rb')
	else:
		f = open(test_file, 'r', newline='')

	movie = {}
	# 'with' closes the file even when a malformed row raises.
	with f:
		for line in csv.DictReader(f, delimiter=',', quotechar='"'):
			# Keep the directors -> writers -> actors order: it fixes the order
			# in which random values are drawn for unknown people.
			directors = _person_scores(line['directors'], ppl, unknown_mean, unknown_std)
			writers = _person_scores(line['writers'], ppl, unknown_mean, unknown_std)
			actors = _person_scores(line['actors'], ppl, unknown_mean, unknown_std)
			movie[line['imdb_id']] = [
				np.mean(np.array(directors)),
				np.mean(np.array(writers)),
				_mean_nonzero(actors[:2]),
				_mean_nonzero(actors[2:4]),
				_mean_nonzero(actors[4:]),
				float(line['score']),
			]
	return movie


def _person_scores(field, ppl, unknown_mean, unknown_std):
	"""Return one score per person listed in a ``['A', 'B']`` style field."""
	scores = []
	for raw in field[1:-1].split(", "):
		# Strip whitespace before unquoting so the lookup key is built the
		# same way indexbuilder builds its keys.
		name = raw.strip()[1:-1]
		if 'more credit' in name:
			continue
		if name in ppl:
			# Known person: use the stored score (may be a numeric string).
			scores.append(float(ppl[name][0]))
		else:
			# Unknown person: draw a value from the normal prior.
			scores.append(random.gauss(unknown_mean, unknown_std))
	return scores


def _mean_nonzero(values):
	"""Average the non-zero entries of *values*; 0 when there are none."""
	kept = [value for value in values if value != 0]
	if not kept:
		return 0
	return sum(kept)/len(kept)

def write_index(index,index_file="index.csv"):
	"""Write a person index to *index_file* as a three-column CSV.

	The file starts with the header ``Name,Score,Weight`` followed by one
	row per entry of *index*.

	Args:
		index: mapping from a name to a sequence whose first two items are
			the score and the weight.
		index_file: path of the file to create or overwrite.
	"""
	# 'with' closes the file even if a row cannot be written.
	with open(index_file, 'w') as f:
		# csv.writer quotes names containing a comma or a quote, so every row
		# parses back into exactly three columns; lineterminator='\n' keeps the
		# same line endings as plain f.write(... + '\n').
		writer = csv.writer(f, lineterminator='\n')
		writer.writerow(['Name', 'Score', 'Weight'])
		for name, entry in index.items():
			# str() keeps the number formatting independent of the csv module.
			writer.writerow([name, str(entry[0]), str(entry[1])])

#def write_movie(movie,movie_file="movie.csv"):
#	f = open(movie_file,'w')
#	f.write('id,director,writer,actor\n')
#	for key in movie.keys():
#		f.write(key+','+str(movie[key][0])+','+str(movie[key][1])+','+str(movie[key][2])+'\n')
#	f.close()