Showing 4 changed files with 420 additions and 0 deletions
코드_데이터/collaborative_filtering_aftask_v1.py
0 → 100644
#!/usr/bin/env python
# coding: utf-8

import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import matplotlib.pyplot as plt

df = pd.read_csv("dataset.csv")

user_ids = df["userid"].unique().tolist()
user2user_encoded = {x: i for i, x in enumerate(user_ids)}
userencoded2user = {i: x for i, x in enumerate(user_ids)}
contents_ids = df["contentsid"].unique().tolist()
contents2contents_encoded = {x: i for i, x in enumerate(contents_ids)}
contents_encoded2contents = {i: x for i, x in enumerate(contents_ids)}
df["user"] = df["userid"].map(user2user_encoded)
df["contents"] = df["contentsid"].map(contents2contents_encoded)

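# The raw userid/contentsid values are mapped to consecutive integers (0..n-1)
# because keras.layers.Embedding expects dense integer indices; the reverse
# dictionaries are kept so predictions can be mapped back to the original ids.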
num_users = len(user2user_encoded)
num_contents = len(contents_encoded2contents)
df["rating"] = df["rating"].values.astype(np.float32)
# min and max ratings will be used to normalize the ratings later
min_rating = 0.5
max_rating = 5.0

print(
    "Number of users: {}, Number of contents: {}, Min rating: {}, Max rating: {}".format(
        num_users, num_contents, min_rating, max_rating
    )
)

df = df.sample(frac=1, random_state=42)
x = df[["user", "contents"]].values
# Normalize the targets between 0 and 1. This makes training easier.
y = df["rating"].apply(lambda x: (x - min_rating) / (max_rating - min_rating)).values
# Train on 90% of the data and validate on the remaining 10%.
train_indices = int(0.9 * df.shape[0])
x_train, x_val, y_train, y_val = (
    x[:train_indices],
    x[train_indices:],
    y[:train_indices],
    y[train_indices:],
)

EMBEDDING_SIZE = 50


class RecommenderNet(keras.Model):
    def __init__(self, num_users, num_contents, embedding_size, **kwargs):
        super().__init__(**kwargs)
        self.num_users = num_users
        self.num_contents = num_contents
        self.embedding_size = embedding_size
        self.user_embedding = layers.Embedding(
            num_users,
            embedding_size,
            embeddings_initializer="he_normal",
            embeddings_regularizer=keras.regularizers.l2(1e-6),
        )
        self.user_bias = layers.Embedding(num_users, 1)
        self.contents_embedding = layers.Embedding(
            num_contents,
            embedding_size,
            embeddings_initializer="he_normal",
            embeddings_regularizer=keras.regularizers.l2(1e-6),
        )
        self.contents_bias = layers.Embedding(num_contents, 1)

    def call(self, inputs):
        user_vector = self.user_embedding(inputs[:, 0])
        user_bias = self.user_bias(inputs[:, 0])
        contents_vector = self.contents_embedding(inputs[:, 1])
        contents_bias = self.contents_bias(inputs[:, 1])
        # Per-example dot product between user and contents embeddings, shape (batch, 1)
        dot_user_contents = tf.reduce_sum(user_vector * contents_vector, axis=1, keepdims=True)
        # Add all the components (including the biases)
        x = dot_user_contents + user_bias + contents_bias
        # The sigmoid activation forces the predicted rating to lie between 0 and 1
        return tf.nn.sigmoid(x)

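# The model is a biased matrix factorization: for a (user u, contents i) pair it
# predicts sigmoid(p_u . q_i + b_u + b_i), where p_u and q_i are the learned
# embedding vectors and b_u, b_i are the per-user / per-contents bias terms.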
model = RecommenderNet(num_users, num_contents, EMBEDDING_SIZE)
model.compile(
    optimizer='sgd',
    loss='mse',
    metrics=[tf.keras.metrics.MeanSquaredError()])

history = model.fit(
    x=x_train,
    y=y_train,
    batch_size=2,
    epochs=20,
    verbose=1,
    validation_data=(x_val, y_val),
)

plt.plot(history.history["loss"])
plt.plot(history.history["val_loss"])
plt.title("model loss")
plt.ylabel("loss")
plt.xlabel("epoch")
plt.legend(["train", "val"], loc="upper left")
plt.show()

test_df = pd.read_csv("dataset_test.csv")

# Reuse the encoders fitted on the training data; rebuilding them from the test
# set would assign different integer indices to the same ids and make the
# evaluation meaningless. Ids unseen during training map to NaN and are dropped.
test_df["user"] = test_df["userid"].map(user2user_encoded)
test_df["contents"] = test_df["contentsid"].map(contents2contents_encoded)
test_df = test_df.dropna(subset=["user", "contents"])
test_df["rating"] = test_df["rating"].values.astype(np.float32)

x_test = test_df[["user", "contents"]].astype("int64").values
y_test = test_df["rating"].apply(lambda x: (x - min_rating) / (max_rating - min_rating)).values
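# evaluate() returns [loss, mean_squared_error] computed on the normalized 0-1
# targets; multiplying the MSE by (max_rating - min_rating) ** 2 expresses it on
# the original 0.5-5.0 rating scale.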
result = model.evaluate(x_test, y_test)
print(result)

코드_데이터/collaborative_filtering_aftask_v2.py
0 → 100644
#!/usr/bin/env python
# coding: utf-8

import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

df_x = pd.read_csv("x_train.csv")
df_y = pd.read_csv("y_train.csv")
df = pd.concat([df_x, df_y], axis=1)
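# Assumes x_train.csv and y_train.csv are row-aligned (same length and order),
# so the column-wise concat pairs each (userid, taskid) with its rating.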

user_ids = df["userid"].unique().tolist()
user2user_encoded = {x: i for i, x in enumerate(user_ids)}
userencoded2user = {i: x for i, x in enumerate(user_ids)}
task_ids = df["taskid"].unique().tolist()
task2task_encoded = {x: i for i, x in enumerate(task_ids)}
task_encoded2task = {i: x for i, x in enumerate(task_ids)}
df["user"] = df["userid"].map(user2user_encoded)
df["task"] = df["taskid"].map(task2task_encoded)

num_users = len(user2user_encoded)
num_task = len(task_encoded2task)
df["rating"] = df["rating"].values.astype(np.float32)
# min and max ratings will be used to normalize the ratings later
MIN_RATING = 0.5
MAX_RATING = 5.0

print(
    "Number of users: {}, Number of tasks: {}, Min rating: {}, Max rating: {}".format(
        num_users, num_task, MIN_RATING, MAX_RATING
    )
)

df = df.sample(frac=1, random_state=42)
x = df[["user", "task"]].values
# Normalize the targets between 0 and 1. This makes training easier.
y = df["rating"].apply(lambda x: (x - MIN_RATING) / (MAX_RATING - MIN_RATING)).values
# Train on 90% of the data and validate on the remaining 10%.
train_indices = int(0.9 * df.shape[0])
x_train, x_val, y_train, y_val = (
    x[:train_indices],
    x[train_indices:],
    y[:train_indices],
    y[train_indices:],
)

EMBEDDING_SIZE = 128

class RecommenderNet(keras.Model):
    def __init__(self, num_users, num_task, embedding_size, **kwargs):
        super().__init__(**kwargs)
        self.num_users = num_users
        self.num_task = num_task
        self.embedding_size = embedding_size
        self.user_embedding = layers.Embedding(
            num_users,
            embedding_size,
            embeddings_initializer="he_normal",
            embeddings_regularizer=keras.regularizers.l2(1e-6),
        )
        self.user_bias = layers.Embedding(num_users, 1)
        self.task_embedding = layers.Embedding(
            num_task,
            embedding_size,
            embeddings_initializer="he_normal",
            embeddings_regularizer=keras.regularizers.l2(1e-6),
        )
        self.task_bias = layers.Embedding(num_task, 1)

    def call(self, inputs):
        user_vector = self.user_embedding(inputs[:, 0])
        user_bias = self.user_bias(inputs[:, 0])
        task_vector = self.task_embedding(inputs[:, 1])
        task_bias = self.task_bias(inputs[:, 1])
        # Per-example dot product between user and task embeddings, shape (batch, 1)
        dot_user_task = tf.reduce_sum(user_vector * task_vector, axis=1, keepdims=True)
        # Add all the components (including the biases)
        x = dot_user_task + user_bias + task_bias
        # The sigmoid activation forces the predicted rating to lie between 0 and 1
        return tf.nn.sigmoid(x)

model = RecommenderNet(num_users, num_task, EMBEDDING_SIZE)
model.compile(
    optimizer='adam',
    loss='mse',
    metrics=[tf.keras.metrics.MeanSquaredError()])

history = model.fit(
    x=x_train,
    y=y_train,
    batch_size=8,
    epochs=300,
    verbose=1,
    validation_data=(x_val, y_val),
)

df_x_test = pd.read_csv('x_test.csv')

df_x_test["user"] = df_x_test["userid"].map(user2user_encoded)
df_x_test["task"] = df_x_test["taskid"].map(task2task_encoded)
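# Note: ids in x_test.csv that never appeared in the training data map to NaN
# above. If that can happen, fill them with a fallback index before predicting
# (dropping rows would break the alignment with y_test.csv), e.g.:
# df_x_test[["user", "task"]] = df_x_test[["user", "task"]].fillna(0).astype(int)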

x_test = df_x_test[["user", "task"]].values

y_pred = model.predict(x_test)

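# Undo the training-time normalization: map the sigmoid output from the 0-1 range
# back to the MIN_RATING-MAX_RATING scale before writing the predictions to disk.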
df_y_pred = pd.DataFrame(y_pred, columns=['rating'])
df_y_pred = df_y_pred["rating"].apply(lambda x: x * (MAX_RATING - MIN_RATING) + MIN_RATING)
df_y_pred.to_csv('y_pred.csv', sep=',', columns=['rating'], index=False)

# evaluate
from sklearn.metrics import mean_squared_error

gt = pd.read_csv('y_test.csv', header=0)
pr = pd.read_csv('y_pred.csv', header=0)

gt = gt.to_numpy().astype(float).reshape(-1)
pr = pr.to_numpy().astype(float).reshape(-1)

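# mean_squared_error(..., squared=False) returns the RMSE; note that newer
# scikit-learn releases deprecate the squared argument in favour of
# root_mean_squared_error.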
score = mean_squared_error(gt, pr, squared=False)
print("score:", score)
코드_데이터/dataset.csv
0 → 100644
(diff collapsed; contents not shown)
코드_데이터/dataset_test.csv
0 → 100644
userid,contentsid,rating
1,T000043,5
1,T000055,0.5
1,T000072,0.5
1,T000064,5
1,T001630,0.5
1,T000308,0.5
1,T000293,0.5
1,T001616,0.5
1,T001613,5
1,T001601,0.5
1,T001919,0.5
1,T001946,5
2,T000046,5
2,T000051,0.5
2,T000074,0.5
2,T000308,0.5
2,T000307,5
2,T000299,0.5
2,T000291,0.5
2,T001613,5
2,T001607,0.5
2,T001920,5
2,T001916,0.5
2,T001943,0.5
3,T000036,5
3,T000049,0.5
3,T000053,0.5
3,T000061,5
3,T000073,0.5
3,T001628,5
3,T000302,5
3,T000212,0.5
3,T001616,0.5
3,T001606,0.5
3,T001920,0.5
3,T001915,0.5
4,T001947,0.5
4,T001921,0.5
4,T001617,0.5
4,T001606,0.5
4,T000040,0.5
4,T000045,0.5
4,T000060,5
4,T000077,0.5
4,T000068,0.5
4,T000302,0.5
4,T000293,0.5
4,T000288,0.5
5,T001956,5
5,T001915,0.5
5,T001611,5
5,T001604,5
5,T000046,0.5
5,T000056,5
5,T000073,0.5
5,T000065,0.5
5,T001630,0.5
5,T000309,0.5
5,T000299,5
5,T000294,0.5
6,T001943,5
6,T001911,0.5
6,T000036,0.5
6,T000050,5
6,T000056,0.5
6,T000059,5
6,T000074,5
6,T000072,0.5
6,T000071,0.5
6,T000293,0.5
6,T000292,5
6,T000212,0.5
7,T000053,5
7,T000054,5
7,T000060,0.5
7,T000078,0.5
7,T000071,0.5
7,T000298,0.5
7,T000288,0.5
7,T001608,0.5
7,T001606,5
7,T001917,0.5
7,T001915,0.5
7,T001914,5
8,T000040,0.5
8,T000044,0.5
8,T000053,0.5
8,T000059,5
8,T000061,5
8,T000072,0.5
8,T001631,0.5
8,T000301,0.5
8,T000295,5
8,T000294,5
8,T001616,5
8,T001944,5
9,T000049,5
9,T000051,0.5
9,T000054,5
9,T000055,5
9,T000056,5
9,T000311,5
9,T000309,5
9,T000297,5
9,T000289,5
9,T001614,0.5
9,T001612,0.5
9,T001956,0.5
10,T000045,0.5
10,T000053,0.5
10,T000061,5
10,T000069,0.5
10,T000068,0.5
10,T000312,0.5
10,T000303,0.5
10,T000297,0.5
10,T000287,5
10,T001615,5
10,T001609,5
10,T001915,5
11,T000036,5
11,T000046,0.5
11,T000054,5
11,T000077,0.5
11,T000069,0.5
11,T001628,5
11,T000308,5
11,T000301,0.5
11,T000288,5
11,T001610,0.5
11,T001600,5
11,T001914,5
12,T000045,0.5
12,T000048,0.5
12,T000056,5
12,T000075,5
12,T000063,0.5
12,T001631,5
12,T000309,0.5
12,T000302,0.5
12,T000291,5
12,T001616,0.5
12,T001918,5
12,T001947,5
13,T000043,5
13,T000052,5
13,T000059,5
13,T000071,5
13,T001628,0.5
13,T000307,5
13,T000299,0.5
13,T000291,5
13,T001614,0.5
13,T001604,5
13,T001921,0.5
13,T001911,0.5