summary refs log tree commit diff
path: root/objectapp/comparison.py
diff options
context:
space:
mode:
Diffstat (limited to 'objectapp/comparison.py')
-rw-r--r-- objectapp/comparison.py 93
1 files changed, 93 insertions, 0 deletions
diff --git a/objectapp/comparison.py b/objectapp/comparison.py
new file mode 100644
index 0000000..d79d78d
--- /dev/null
+++ b/objectapp/comparison.py
@@ -0,0 +1,93 @@
+"""Comparison tools for Objectapp
+Based on clustered_models app"""
+from math import sqrt
+
+from objectapp.settings import F_MIN
+from objectapp.settings import F_MAX
+
+
+def pearson_score(list1, list2):
+    """Return the Pearson distance between two equal-length numeric vectors.
+
+    The result is ``1.0 - r`` where ``r`` is the Pearson correlation
+    coefficient, so perfectly correlated vectors give 0.0 and perfectly
+    anti-correlated vectors give 2.0. 0.0 is also returned when the
+    correlation cannot be computed (empty vectors, or a vector without
+    any variance).
+
+    Raises ValueError when the two vectors do not have the same length.
+    """
+    size = len(list1)
+    if len(list2) != size:
+        raise ValueError('pearson_score() requires vectors of equal length')
+    if not size:
+        return 0.0
+
+    # Divide by a float: with integer inputs, "/" truncates on Python 2
+    # and silently skews both the numerator and the variances.
+    count = float(size)
+
+    sum1 = sum(list1)
+    sum2 = sum(list2)
+    sum_sq1 = sum(value * value for value in list1)
+    sum_sq2 = sum(value * value for value in list2)
+    prod_sum = sum(left * right for left, right in zip(list1, list2))
+
+    num = prod_sum - sum1 * sum2 / count
+    # Rounding can push a zero variance slightly below zero; clamp it so
+    # sqrt() never raises a math domain error.
+    var1 = max(sum_sq1 - sum1 * sum1 / count, 0.0)
+    var2 = max(sum_sq2 - sum2 * sum2 / count, 0.0)
+    den = sqrt(var1 * var2)
+    if den == 0:
+        return 0.0
+    return 1.0 - num / den
+
+
+class ClusteredModel(object):
+    """Wrap a queryset and expose its instances as text.
+
+    ``queryset`` must provide ``filter()``; ``fields`` lists the instance
+    attributes whose values are joined into each instance's text and
+    defaults to ``['id']``.
+    """
+
+    def __init__(self, queryset, fields=None):
+        # A list literal as default would be shared by every instance
+        # created without ``fields``; build a fresh one per instance.
+        self.fields = ['id'] if fields is None else fields
+        self.queryset = queryset
+
+    def dataset(self):
+        """Return a dict mapping each instance to its space-joined fields.
+
+        Values are read from the instance ``__dict__``, so every name in
+        ``self.fields`` must be stored directly on the instance; a missing
+        name raises KeyError.
+        """
+        try:
+            text_type = unicode  # Python 2
+        except NameError:
+            text_type = str  # Python 3 has no ``unicode`` builtin
+
+        dataset = {}
+        for item in self.queryset.filter():
+            dataset[item] = ' '.join(text_type(item.__dict__[field])
+                                     for field in self.fields)
+        return dataset
+
+
+class VectorBuilder(object):
+    """Turn a queryset into word-count vectors, one per instance.
+
+    Calling the builder returns ``(columns, dataset)``: ``columns`` is the
+    list of retained words and ``dataset`` maps every instance to its
+    counts for those words, in column order.
+    """
+
+    def __init__(self, queryset, fields):
+        self.key = ''
+        self.columns = []
+        self.dataset = {}
+        self.clustered_model = ClusteredModel(queryset, fields)
+        self.build_dataset()
+
+    def build_dataset(self):
+        """Recompute the columns, the vectors and the cache key."""
+        overall_counts = {}
+        counts_by_instance = {}
+
+        for instance, text in self.clustered_model.dataset().items():
+            instance_counts = {}
+            for word in text.split():
+                overall_counts[word] = overall_counts.get(word, 0) + 1
+                instance_counts[word] = instance_counts.get(word, 0) + 1
+            counts_by_instance[instance] = instance_counts
+
+        # A word becomes a column only when its total occurrences divided
+        # by the number of instances lies strictly between F_MIN and F_MAX.
+        instance_total = len(counts_by_instance)
+        kept_words = [
+            word for word, occurrences in overall_counts.items()
+            if F_MIN < float(occurrences) / instance_total < F_MAX]
+
+        self.dataset = {}
+        self.columns = kept_words
+        for instance, instance_counts in counts_by_instance.items():
+            vector = [instance_counts.get(word, 0) for word in kept_words]
+            self.dataset[instance] = vector
+        self.key = self.generate_key()
+
+    def generate_key(self):
+        """Return the queryset's current count, used as the cache key."""
+        queryset = self.clustered_model.queryset
+        return queryset.count()
+
+    def flush(self):
+        """Rebuild the dataset when the cache key no longer matches."""
+        if self.key == self.generate_key():
+            return
+        self.build_dataset()
+
+    def __call__(self):
+        """Refresh if needed and return ``(columns, dataset)``."""
+        self.flush()
+        return self.columns, self.dataset