1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
|
"""Comparison tools for Objectapp
Based on clustered_models app"""
from math import sqrt
from objectapp.settings import F_MIN
from objectapp.settings import F_MAX
def pearson_score(list1, list2):
    """Compute the Pearson distance between two numeric vectors.

    The returned value is ``1.0 - r``, where ``r`` is the Pearson
    correlation coefficient of the two sequences: ``0.0`` means the
    vectors are perfectly correlated and ``2.0`` means they are
    perfectly anti-correlated.

    :param list1: first sequence of numbers.
    :param list2: second sequence of numbers, expected to be as long as
        ``list1`` (a shorter ``list2`` raises ``IndexError``).
    :returns: ``1.0 - r`` as a float, or ``0.0`` when the correlation is
        undefined (empty ``list1`` or a vector without any variance).
    """
    if len(list1) == 0:
        # Nothing to correlate; also avoids dividing by a zero length.
        return 0.0

    sum1 = sum(list1)
    sum2 = sum(list2)
    sum_sq1 = sum(value * value for value in list1)
    sum_sq2 = sum(value * value for value in list2)
    prod_sum = sum(value * list2[index]
                   for index, value in enumerate(list1))

    # Lengths are turned into floats so that the divisions below are
    # true divisions on Python 2 too, even for vectors of integer counts.
    size1 = float(len(list1))
    size2 = float(len(list2))

    num = prod_sum - (sum1 * sum2 / size1)
    spread1 = sum_sq1 - sum1 * sum1 / size1
    spread2 = sum_sq2 - sum2 * sum2 / size2
    if spread1 <= 0 or spread2 <= 0:
        # A constant vector has no variance; floating point rounding may
        # even push the value slightly below zero, which sqrt() rejects.
        return 0.0

    den = sqrt(spread1 * spread2)
    if den == 0:
        # The product of two tiny spreads can still underflow to zero.
        return 0.0
    return 1.0 - num / den
try:
    _text_type = unicode  # Python 2 text type
except NameError:
    _text_type = str  # Python 3 has no ``unicode`` builtin


class ClusteredModel(object):
    """Wrapper around a queryset building a dataset of its instances.

    :param queryset: object whose ``filter()`` returns the instances to
        include in the dataset.
    :param fields: names of the instance attributes to concatenate for
        each instance; defaults to ``['id']``.
    """

    def __init__(self, queryset, fields=None):
        # A new list is created for every instance: a list literal used
        # as the default would be a single object shared by all of them.
        self.fields = ['id'] if fields is None else fields
        self.queryset = queryset

    def dataset(self):
        """Generate a dataset with the queryset and specified fields.

        :returns: dict mapping each instance to the text of its
            configured fields joined with single spaces.
        :raises KeyError: if a field is missing from an instance's
            ``__dict__``.
        """
        dataset = {}
        # filter() without arguments is kept from the original code;
        # presumably it yields a fresh copy of the queryset -- confirm
        # against the queryset implementation in use.
        for item in self.queryset.filter():
            # Values are read straight from the instance __dict__, so
            # only attributes stored on the instance itself are visible.
            dataset[item] = ' '.join([_text_type(item.__dict__[field])
                                      for field in self.fields])
        return dataset
class VectorBuilder(object):
    """Build word-count vectors for the instances of a queryset.

    Calling an instance returns ``(columns, dataset)``: ``columns`` is
    the list of retained words and ``dataset`` maps every instance to
    its list of counts, one per column and in the same order.
    """

    def __init__(self, queryset, fields):
        self.key = ''
        self.columns = []
        self.dataset = {}
        self.clustered_model = ClusteredModel(queryset, fields)
        # The vectors are computed eagerly at construction time.
        self.build_dataset()

    def build_dataset(self):
        """Recompute the columns, the vectors and the cache key."""
        per_instance = {}
        overall = {}
        for instance, text in self.clustered_model.dataset().items():
            counts = {}
            for token in text.split():
                overall[token] = overall.get(token, 0) + 1
                counts[token] = counts.get(token, 0) + 1
            per_instance[instance] = counts

        # A word is kept when its total number of occurrences divided by
        # the number of instances lies strictly between F_MIN and F_MAX.
        total = len(per_instance)
        selected = [token for token, count in overall.items()
                    if F_MIN < float(count) / total < F_MAX]

        self.columns = selected
        self.dataset = dict(
            (instance, [counts.get(token, 0) for token in selected])
            for instance, counts in per_instance.items())
        self.key = self.generate_key()

    def generate_key(self):
        """Return the key identifying the current state of the queryset.

        The key is the value of ``count()`` on the wrapped queryset.
        """
        queryset = self.clustered_model.queryset
        return queryset.count()

    def flush(self):
        """Rebuild the dataset when the stored key is out of date."""
        if self.key == self.generate_key():
            return
        self.build_dataset()

    def __call__(self):
        """Return ``(columns, dataset)``, refreshing them if needed."""
        self.flush()
        return self.columns, self.dataset
|