-
Notifications
You must be signed in to change notification settings - Fork 2
/
dbs.py
81 lines (62 loc) · 3.06 KB
/
dbs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
import pandas as pd, numpy as np, matplotlib.pyplot as plt, time
from sklearn.cluster import DBSCAN
from sklearn import metrics
from geopy.distance import great_circle
from shapely.geometry import MultiPoint
import sqlite3
import json, requests
def cluster(db_path="safai.db", eps_km=5.0, min_samples=3):
    """Cluster reported (lat, lon) points with DBSCAN and return one
    representative point per cluster.

    Parameters
    ----------
    db_path : str
        Path to the SQLite database containing the ``report`` table
        (columns ``lattitude``, ``longitude``).  Defaults to the
        previously hard-coded "safai.db".
    eps_km : float
        DBSCAN neighbourhood radius in kilometres (default 5, as before).
    min_samples : int
        Minimum number of points to form a cluster (default 3, as before).

    Returns
    -------
    tuple
        ``(lats, lons)`` — parallel tuples with the latitude and longitude
        of the centermost member of each cluster.  ``((), ())`` when the
        table is empty or every point is labelled as noise.
    """
    # Read all reported coordinates.  The original leaked the connection
    # and issued a needless commit() after a read-only SELECT.
    connection = sqlite3.connect(db_path)
    try:
        cursor = connection.cursor()
        cursor.execute('SELECT lattitude, longitude from report')
        res = cursor.fetchall()
    finally:
        connection.close()

    if not res:
        # No reports yet: nothing to cluster (the original crashed here).
        return ((), ())

    # Rows of (lat, lon) -> float ndarray for scikit-learn.
    coords = np.array([[row[0], row[1]] for row in res], dtype=float)

    # The haversine metric works in radians, so convert eps from km using
    # the mean Earth radius (km per radian).
    kms_per_radian = 6371.0088
    epsilon = eps_km / kms_per_radian
    db = DBSCAN(eps=epsilon, min_samples=min_samples,
                algorithm='ball_tree', metric='haversine').fit(np.radians(coords))
    cluster_labels = db.labels_

    # DBSCAN marks noise with label -1.  The original counted it as a
    # cluster, which shifted range(num_clusters) past the real labels and
    # produced an empty cluster -> ValueError in min() below.
    real_labels = set(cluster_labels)
    real_labels.discard(-1)
    num_clusters = len(real_labels)
    if num_clusters == 0:
        return ((), ())

    # One entry per cluster: the ndarray of its member coordinates.
    clusters = pd.Series([coords[cluster_labels == n] for n in range(num_clusters)])

    def get_centermost_point(cluster):
        # Member closest (great-circle metres) to the cluster centroid.
        mp = MultiPoint(cluster)  # build once, not twice as before
        centroid = (mp.centroid.x, mp.centroid.y)
        return tuple(min(cluster, key=lambda point: great_circle(point, centroid).m))

    centermost_points = clusters.map(get_centermost_point)
    lats, lons = zip(*centermost_points)
    return (lats, lons)
#print(lons)
# from these lats/lons create a new df of one representative point for each cluster
# rep_points = pd.DataFrame({'lon':lons, 'lat':lats})
# rep_points.tail()
# # pull row from original data set where lat/lon match the lat/lon of each row of representative points
# # that way we get the full details like city, country, and date from the original dataframe
# rs = rep_points.apply(lambda row: df[(df['lat']==row['lat']) & (df['lon']==row['lon'])].iloc[0], axis=1)
# rs.to_csv('summer-travel-gps-dbscan.csv', encoding='utf-8')
# rs.tail()
# # plot the final reduced set of coordinate points vs the original full set
# fig, ax = plt.subplots(figsize=[10, 6])
# rs_scatter = ax.scatter(rs['lon'], rs['lat'], c='#99cc99', edgecolor='None', alpha=0.7, s=120)
# df_scatter = ax.scatter(df['lon'], df['lat'], c='k', alpha=0.9, s=3)
# ax.set_title('Full data set vs DBSCAN reduced set')
# ax.set_xlabel('Longitude')
# ax.set_ylabel('Latitude')
# ax.legend([df_scatter, rs_scatter], ['Full set', 'Reduced set'], loc='upper right')
# plt.show()