forked from princewen/tensorflow_practice
Commit 14cdf91 (1 parent: 58965de)
shixiaowen03 committed Mar 1, 2019
Showing 7 changed files with 1,145 additions and 249 deletions.
@@ -49,6 +49,9 @@ test_data/
result/
export_ptr_model/
data/
Data/
Log/
log/
__pycache__/

log/
@@ -0,0 +1,185 @@
import multiprocessing
import numpy as np

# module-level state shared with the worker processes spawned in _preprocess()
_Dataset = None
_batch_size = None
_num_negatives = None
_num_items = None
_user_input = None
_item_input = None
_labels = None
_index = None
_num_batch = None
_batch_length = None
def shuffle(dataset, batch_choice, num_negatives):  # negative sampling and shuffle the data
    global _Dataset
    global _batch_size
    global _num_negatives
    global _num_items
    global _user_input
    global _item_input
    global _labels
    global _index
    global _num_batch
    global _batch_length
    _Dataset = dataset
    _num_negatives = num_negatives

    if batch_choice == 'user':
        # one batch per user: all of that user's positives plus sampled negatives
        _num_items, _user_input, _item_input, _labels, _batch_length = _get_train_data_user()
        _num_batch = len(_batch_length)
        return _preprocess(_get_train_batch_user)
    else:
        batch_choices = batch_choice.split(":")
        if batch_choices[0] == 'fixed':
            # fixed-size batches, e.g. batch_choice = "fixed:256"
            _batch_size = int(batch_choices[1])
            _num_items, _user_input, _item_input, _labels = _get_train_data_fixed()
            iterations = len(_user_input)
            _index = np.arange(iterations)
            _num_batch = iterations // _batch_size  # integer division so range(_num_batch) works
            return _preprocess(_get_train_batch_fixed)
        else:
            print("invalid batch_choice: %s" % batch_choice)
def batch_gen(batches, i):
    # pick out batch i from the four parallel lists returned by shuffle()
    return [(batches[r])[i] for r in range(4)]


def _preprocess(get_train_batch):  # generate the masked batch list
    user_input_list, num_idx_list, item_input_list, labels_list = [], [], [], []
    cpu_count = multiprocessing.cpu_count()
    if cpu_count == 1:
        for i in range(_num_batch):
            ui, ni, ii, l = get_train_batch(i)
            user_input_list.append(ui)
            num_idx_list.append(ni)
            item_input_list.append(ii)
            labels_list.append(l)
    else:
        # build the batches in parallel across CPU cores
        pool = multiprocessing.Pool(cpu_count)
        res = pool.map(get_train_batch, list(range(_num_batch)))
        pool.close()
        pool.join()
        user_input_list = [r[0] for r in res]
        num_idx_list = [r[1] for r in res]
        item_input_list = [r[2] for r in res]
        labels_list = [r[3] for r in res]
    return (user_input_list, num_idx_list, item_input_list, labels_list)
def _get_train_data_user():
    user_input, item_input, labels, batch_length = [], [], [], []
    train = _Dataset.trainMatrix
    trainList = _Dataset.trainList
    num_items = train.shape[1]
    num_users = train.shape[0]
    for u in range(num_users):
        # batch_length[u] is the cumulative number of (positive + negative) samples up to user u
        if u == 0:
            batch_length.append((1 + _num_negatives) * len(trainList[u]))
        else:
            batch_length.append((1 + _num_negatives) * len(trainList[u]) + batch_length[u - 1])
        for i in trainList[u]:
            # positive instance
            user_input.append(u)
            item_input.append(i)
            labels.append(1)
            # negative instances
            for t in range(_num_negatives):
                j = np.random.randint(num_items)
                while j in trainList[u]:
                    j = np.random.randint(num_items)
                user_input.append(u)
                item_input.append(j)
                labels.append(0)
    return num_items, user_input, item_input, labels, batch_length
def _get_train_batch_user(i):
    # represent each user by the set of items he/she has rated
    user_list, num_list, item_list, labels_list = [], [], [], []
    trainList = _Dataset.trainList
    if i == 0:
        begin = 0
    else:
        begin = _batch_length[i - 1]
    batch_index = list(range(begin, _batch_length[i]))
    np.random.shuffle(batch_index)
    for idx in batch_index:
        user_idx = _user_input[idx]
        item_idx = _item_input[idx]
        nonzero_row = []
        nonzero_row += trainList[user_idx]
        num_list.append(_remove_item(_num_items, nonzero_row, item_idx))
        user_list.append(nonzero_row)
        item_list.append(item_idx)
        labels_list.append(_labels[idx])
    user_input = np.array(_add_mask(_num_items, user_list, max(num_list)))
    num_idx = np.array(num_list)
    item_input = np.array(item_list)
    labels = np.array(labels_list)
    return (user_input, num_idx, item_input, labels)
def _get_train_data_fixed():
    user_input, item_input, labels = [], [], []
    train = _Dataset.trainMatrix
    num_items = train.shape[1]
    for (u, i) in train.keys():
        # positive instance
        user_input.append(u)
        item_input.append(i)
        labels.append(1)
        # negative instances
        for t in range(_num_negatives):
            j = np.random.randint(num_items)
            while (u, j) in train:  # membership test; dok_matrix.has_key() is Python 2 only
                j = np.random.randint(num_items)
            user_input.append(u)
            item_input.append(j)
            labels.append(0)
    return num_items, user_input, item_input, labels
def _get_train_batch_fixed(i):
    # represent each user by the set of items he/she has rated
    user_list, num_list, item_list, labels_list = [], [], [], []
    trainList = _Dataset.trainList
    begin = i * _batch_size
    for idx in range(begin, begin + _batch_size):
        user_idx = _user_input[_index[idx]]
        item_idx = _item_input[_index[idx]]
        nonzero_row = []
        nonzero_row += trainList[user_idx]
        num_list.append(_remove_item(_num_items, nonzero_row, item_idx))
        user_list.append(nonzero_row)
        item_list.append(item_idx)
        labels_list.append(_labels[_index[idx]])
    user_input = np.array(_add_mask(_num_items, user_list, max(num_list)))
    num_idx = np.array(num_list)
    item_input = np.array(item_list)
    labels = np.array(labels_list)
    return (user_input, num_idx, item_input, labels)
def _remove_item(feature_mask, users, item):
    # remove the target item from the user's history (swap in the last item and put the
    # mask id at the end); return the number of real items that remain
    flag = 0
    for i in range(len(users)):
        if users[i] == item:
            users[i] = users[-1]
            users[-1] = feature_mask
            flag = 1
            break
    return len(users) - flag
def _add_mask(feature_mask, features, num_max):
    # pad every history in the batch with the mask id so they all have length num_max + 1
    for i in range(len(features)):
        features[i] = features[i] + [feature_mask] * (num_max + 1 - len(features[i]))
    return features
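
A minimal sketch of how this batch generator might be driven during training; the module names, data path, and hyper-parameter values below are assumptions for illustration, not taken from this commit.

from DataSet import Dataset      # assumed module name for the Dataset class added below
import BatchGenUser as bg        # assumed module name for this file

dataset = Dataset("Data/ml-1m")  # hypothetical path prefix for the rating files
batches = bg.shuffle(dataset, "fixed:256", 4)   # fixed batches of 256, 4 negatives per positive
for i in range(len(batches[0])):
    user_input, num_idx, item_input, labels = bg.batch_gen(batches, i)
    # user_input: padded item histories, num_idx: true history lengths,
    # item_input: target items, labels: 0/1 interaction labels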
@@ -0,0 +1,96 @@
import scipy.sparse as sp
import numpy as np

ITEM_CLIP = 300  # keep at most this many rated items per user in trainList


class Dataset(object):

    def __init__(self, path):
        self.trainMatrix = self.load_training_file_as_matrix(path + ".train.rating")
        self.trainList = self.load_training_file_as_list(path + ".train.rating")
        self.testRatings = self.load_rating_file_as_list(path + ".test.rating")
        self.testNegatives = self.load_negative_file(path + ".test.negative")
        assert len(self.testRatings) == len(self.testNegatives)
        self.num_users, self.num_items = self.trainMatrix.shape
    def load_negative_file(self, filename):
        negativeList = []
        with open(filename, "r") as f:
            line = f.readline()
            while line is not None and line != "":  # each line holds all the negative items for one user
                arr = line.split("\t")
                negatives = []
                for x in arr[1:]:
                    negatives.append(int(x))
                negativeList.append(negatives)
                line = f.readline()
        return negativeList

    def load_rating_file_as_list(self, filename):
        ratingList = []
        with open(filename, "r") as f:
            line = f.readline()
            while line is not None and line != "":
                arr = line.split("\t")
                user, item = int(arr[0]), int(arr[1])
                ratingList.append([user, item])
                line = f.readline()
        return ratingList
    def load_training_file_as_list(self, filename):
        # the training file is sorted by user id, so consecutive lines belong to the same user
        u_ = 0
        lists, items = [], []
        with open(filename, "r") as f:
            line = f.readline()
            index = 0
            while line is not None and line != "":
                arr = line.split("\t")
                u, i = int(arr[0]), int(arr[1])
                if u_ < u:
                    index = 0
                    lists.append(items)  # items collects every item the previous user has rated
                    items = []
                    u_ += 1
                index += 1
                if index < ITEM_CLIP:
                    items.append(i)
                line = f.readline()
            lists.append(items)
        print("already load the trainList...")
        return lists
    def load_training_file_as_matrix(self, filename):
        # first pass: find the matrix dimensions
        num_users, num_items = 0, 0
        with open(filename, "r") as f:
            line = f.readline()
            while line is not None and line != "":
                arr = line.split("\t")
                u, i = int(arr[0]), int(arr[1])
                num_users = max(num_users, u)
                num_items = max(num_items, i)
                line = f.readline()

        # second pass: fill a sparse 0/1 interaction matrix
        mat = sp.dok_matrix((num_users + 1, num_items + 1), dtype=np.float32)
        with open(filename, "r") as f:
            line = f.readline()
            while line is not None and line != "":
                arr = line.split("\t")
                user, item, rating = int(arr[0]), int(arr[1]), float(arr[2])
                if rating > 0:
                    mat[user, item] = 1.0
                line = f.readline()
        print("already load the trainMatrix...")
        return mat
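
A minimal sketch of how this loader might be used, assuming the rating-file naming the constructor expects (prefix + ".train.rating", ".test.rating", ".test.negative"); the module name and path prefix are hypothetical.

from DataSet import Dataset              # assumed module name for this file

dataset = Dataset("Data/ml-1m")          # expects Data/ml-1m.train.rating etc.
print("users: %d, items: %d" % (dataset.num_users, dataset.num_items))
print("train interactions: %d" % dataset.trainMatrix.nnz)
print("test ratings: %d, negative lists: %d"
      % (len(dataset.testRatings), len(dataset.testNegatives)))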