윤정환

코드, 데이터셋 업로드

1 +#!/usr/bin/env python
2 +# coding: utf-8
3 +
4 +# In[ ]:
5 +
6 +
7 +import pandas as pd
8 +import numpy as np
9 +import tensorflow as tf
10 +from tensorflow import keras
11 +from tensorflow.keras import layers
12 +from pathlib import Path
13 +import matplotlib.pyplot as plt
14 +
# --- Load the ratings and build the train / validation split ----------------
# The CSV is expected to provide the columns read below: userid, contentsid,
# rating. The context manager closes the file handle as soon as parsing is
# done (the handle used to stay open for the lifetime of the process).
with open("dataset.csv", "r") as dataset_file:
    df = pd.read_csv(dataset_file)

# Map raw ids to contiguous integer indices (in first-seen order) so they can
# be used as row numbers of the embedding tables, and keep the inverse maps.
user_ids = df["userid"].unique().tolist()
user2user_encoded = {raw_id: index for index, raw_id in enumerate(user_ids)}
userencoded2user = dict(enumerate(user_ids))
contents_ids = df["contentsid"].unique().tolist()
contents2contents_encoded = {raw_id: index for index, raw_id in enumerate(contents_ids)}
contents_encoded2contents = dict(enumerate(contents_ids))
df["user"] = df["userid"].map(user2user_encoded)
df["contents"] = df["contentsid"].map(contents2contents_encoded)

num_users = len(user2user_encoded)
num_contents = len(contents_encoded2contents)
df["rating"] = df["rating"].values.astype(np.float32)
# Bounds of the rating scale, used to normalize the ratings later. They are
# fixed constants here rather than derived from the data.
min_rating = 0.5
max_rating = 5.0

print(
    "Number of users: {}, Number of Contents: {}, Min rating: {}, Max rating: {}".format(
        num_users, num_contents, min_rating, max_rating
    )
)

# Shuffle with a fixed seed so the split below is reproducible.
df = df.sample(frac=1, random_state=42)
x = df[["user", "contents"]].values
# Normalize the targets between 0 and 1. Makes it easy to train.
y = df["rating"].apply(
    lambda rating: (rating - min_rating) / (max_rating - min_rating)
).values
# Train on the first 90% of the shuffled rows and validate on the last 10%.
train_indices = int(0.9 * df.shape[0])
x_train, x_val, y_train, y_val = (
    x[:train_indices],
    x[train_indices:],
    y[:train_indices],
    y[train_indices:],
)

EMBEDDING_SIZE = 50
54 +
55 +
class RecommenderNet(keras.Model):
    """Embedding-based collaborative-filtering model.

    Each user and each content item gets an ``embedding_size``-dimensional
    vector plus a scalar bias. The score of a (user, contents) pair is the
    dot product of the two vectors plus both biases, passed through a sigmoid.

    Args:
        num_users: Number of rows in the user embedding tables.
        num_contents: Number of rows in the contents embedding tables.
        embedding_size: Width of the user / contents embedding vectors.
        **kwargs: Forwarded to ``keras.Model``.
    """

    def __init__(self, num_users, num_contents, embedding_size, **kwargs):
        super().__init__(**kwargs)
        self.num_users = num_users
        self.num_contents = num_contents
        self.embedding_size = embedding_size
        self.user_embedding = layers.Embedding(
            num_users,
            embedding_size,
            embeddings_initializer="he_normal",
            embeddings_regularizer=keras.regularizers.l2(1e-6),
        )
        self.user_bias = layers.Embedding(num_users, 1)
        self.contents_embedding = layers.Embedding(
            num_contents,
            embedding_size,
            embeddings_initializer="he_normal",
            embeddings_regularizer=keras.regularizers.l2(1e-6),
        )
        self.contents_bias = layers.Embedding(num_contents, 1)

    def call(self, inputs):
        """Score a batch of (user index, contents index) pairs.

        Args:
            inputs: 2-D integer tensor; column 0 holds user indices and
                column 1 holds contents indices.

        Returns:
            Tensor of shape (batch, 1) with values in (0, 1).
        """
        user_index = inputs[:, 0]
        contents_index = inputs[:, 1]
        user_vector = self.user_embedding(user_index)
        user_bias = self.user_bias(user_index)
        contents_vector = self.contents_embedding(contents_index)
        contents_bias = self.contents_bias(contents_index)
        # Per-sample dot product. tf.tensordot(..., 2) contracted the batch
        # axis as well and produced one scalar shared by the whole batch, so
        # every prediction depended on the other rows in its batch.
        dot_user_contents = tf.reduce_sum(
            user_vector * contents_vector, axis=1, keepdims=True
        )
        # Add all the components (including bias)
        x = dot_user_contents + user_bias + contents_bias
        # The sigmoid activation forces the rating to between 0 and 1
        return tf.nn.sigmoid(x)
87 +
88 +
# Build the model and train it with plain SGD on the mean-squared error of
# the normalized ratings.
model = RecommenderNet(num_users, num_contents, EMBEDDING_SIZE)
model.compile(
    optimizer="sgd",
    loss="mse",
    metrics=[tf.keras.metrics.MeanSquaredError()],
)

validation_pair = (x_val, y_val)
history = model.fit(
    x_train,
    y_train,
    batch_size=2,
    epochs=20,
    verbose=1,
    validation_data=validation_pair,
)

# Draw the training and validation loss curves, in that order, so the legend
# entries below line up with them.
for curve_name in ("loss", "val_loss"):
    plt.plot(history.history[curve_name])
plt.title("model loss")
plt.ylabel("loss")
plt.xlabel("epoch")
plt.legend(["train", "test"], loc="upper left")
plt.show()
111 +
# --- Evaluate the trained model on the held-out test file -------------------
# The frame is named test_df, not tf: binding it to `tf` shadowed the
# TensorFlow module that RecommenderNet.call looks up when it is traced.
with open("dataset_test.csv", "r") as test_file:
    test_df = pd.read_csv(test_file)

# Encode with the mappings built from the training data. Building fresh
# mappings from the test file would assign different indices to the same ids,
# so the lookups would hit unrelated embedding rows.
test_df["user"] = test_df["userid"].map(user2user_encoded)
test_df["contents"] = test_df["contentsid"].map(contents2contents_encoded)

# Ids that never occurred in training map to NaN and have no embedding row;
# such rows cannot be scored, so they are reported and left out.
unseen_rows = test_df["user"].isna() | test_df["contents"].isna()
if unseen_rows.any():
    print(
        "Skipping {} test rows with users/contents unseen during training".format(
            int(unseen_rows.sum())
        )
    )
    test_df = test_df.loc[~unseen_rows].copy()
if test_df.empty:
    raise ValueError("dataset_test.csv has no rows that can be evaluated")

test_df["rating"] = test_df["rating"].values.astype(np.float32)

test_df = test_df.sample(frac=1, random_state=42)
# The map() results are float when NaNs were present; cast back to integers.
x_test = test_df[["user", "contents"]].values.astype(np.int64)
# Same normalization as the training targets.
y_test = test_df["rating"].apply(
    lambda rating: (rating - min_rating) / (max_rating - min_rating)
).values

result = model.evaluate(x_test, y_test)
print(result)
132 +
1 +#!/usr/bin/env python
2 +# coding: utf-8
3 +
4 +# In[ ]:
5 +
6 +
7 +import pandas as pd
8 +import numpy as np
9 +import tensorflow as tf
10 +from tensorflow import keras
11 +from tensorflow.keras import layers
12 +from pathlib import Path
13 +import matplotlib.pyplot as plt
14 +
# Features and labels are stored in separate CSV files; join them column-wise
# into one frame. The columns read below are userid, taskid and rating.
df_x = pd.read_csv("x_train.csv")
df_y = pd.read_csv("y_train.csv")
df = pd.concat((df_x, df_y), axis=1)

# Raw id -> contiguous index (first-seen order), plus the inverse lookups.
user_ids = df["userid"].unique().tolist()
user2user_encoded = dict(zip(user_ids, range(len(user_ids))))
userencoded2user = dict(enumerate(user_ids))
task_ids = df["taskid"].unique().tolist()
task2task_encoded = dict(zip(task_ids, range(len(task_ids))))
task_encoded2task = dict(enumerate(task_ids))
df["user"] = df["userid"].map(user2user_encoded)
df["task"] = df["taskid"].map(task2task_encoded)

num_users = len(user2user_encoded)
num_task = len(task_encoded2task)
df["rating"] = df["rating"].to_numpy().astype(np.float32)
# Fixed bounds of the rating scale; used to normalize the ratings later.
MIN_RATING = 0.5
MAX_RATING = 5.0

summary_template = "Number of users: {}, Number of task: {}, Min rating: {}, Max rating: {}"
print(summary_template.format(num_users, num_task, MIN_RATING, MAX_RATING))

# Seeded shuffle, then split by position.
df = df.sample(frac=1, random_state=42)
x = df[["user", "task"]].values
# Scale the targets into [0, 1] to match the model's sigmoid output.
y = df["rating"].apply(
    lambda rating: (rating - MIN_RATING) / (MAX_RATING - MIN_RATING)
).values
# First 90% of the shuffled rows train the model, the remaining 10% validate.
train_indices = int(0.9 * df.shape[0])
x_train, x_val = x[:train_indices], x[train_indices:]
y_train, y_val = y[:train_indices], y[train_indices:]

EMBEDDING_SIZE = 128
55 +
class RecommenderNet(keras.Model):
    """Embedding-based collaborative-filtering model for (user, task) pairs.

    Each user and each task gets an ``embedding_size``-dimensional vector plus
    a scalar bias. The score of a pair is the dot product of the two vectors
    plus both biases, passed through a sigmoid.

    Args:
        num_users: Number of rows in the user embedding tables.
        num_task: Number of rows in the task embedding tables.
        embedding_size: Width of the user / task embedding vectors.
        **kwargs: Forwarded to ``keras.Model``.
    """

    def __init__(self, num_users, num_task, embedding_size, **kwargs):
        super().__init__(**kwargs)
        self.num_users = num_users
        self.num_task = num_task
        self.embedding_size = embedding_size
        self.user_embedding = layers.Embedding(
            num_users,
            embedding_size,
            embeddings_initializer="he_normal",
            embeddings_regularizer=keras.regularizers.l2(1e-6),
        )
        self.user_bias = layers.Embedding(num_users, 1)
        self.task_embedding = layers.Embedding(
            num_task,
            embedding_size,
            embeddings_initializer="he_normal",
            embeddings_regularizer=keras.regularizers.l2(1e-6),
        )
        self.task_bias = layers.Embedding(num_task, 1)

    def call(self, inputs):
        """Score a batch of (user index, task index) pairs.

        Args:
            inputs: 2-D integer tensor; column 0 holds user indices and
                column 1 holds task indices.

        Returns:
            Tensor of shape (batch, 1) with values in (0, 1).
        """
        user_index = inputs[:, 0]
        task_index = inputs[:, 1]
        user_vector = self.user_embedding(user_index)
        user_bias = self.user_bias(user_index)
        task_vector = self.task_embedding(task_index)
        task_bias = self.task_bias(task_index)
        # Per-sample dot product. tf.tensordot(..., 2) contracted the batch
        # axis as well and produced one scalar shared by the whole batch, so
        # every prediction depended on the other rows in its batch.
        dot_user_task = tf.reduce_sum(user_vector * task_vector, axis=1, keepdims=True)
        # Add all the components (including bias)
        x = dot_user_task + user_bias + task_bias
        # The sigmoid activation forces the rating to between 0 and 1
        return tf.nn.sigmoid(x)
87 +
88 +
# Build the model and train it with Adam on the mean-squared error of the
# normalized ratings, monitoring the validation split after every epoch.
model = RecommenderNet(num_users, num_task, EMBEDDING_SIZE)
model.compile(
    optimizer="adam",
    loss="mse",
    metrics=[tf.keras.metrics.MeanSquaredError()],
)

validation_pair = (x_val, y_val)
history = model.fit(
    x_train,
    y_train,
    batch_size=8,
    epochs=300,
    verbose=1,
    validation_data=validation_pair,
)
103 +
# --- Predict ratings for the test pairs and write them to y_pred.csv --------
df_x_test = pd.read_csv("x_test.csv")

# Encode with the mappings built from the training data so the indices refer
# to the embedding rows the model was trained with.
df_x_test["user"] = df_x_test["userid"].map(user2user_encoded)
df_x_test["task"] = df_x_test["taskid"].map(task2task_encoded)

# Ids that never occurred in training map to NaN and have no embedding row.
# Rows cannot be dropped (the output must stay aligned with the input rows),
# so fail early with a clear message instead of feeding NaN indices to the
# model.
unseen_rows = df_x_test[["user", "task"]].isna().any(axis=1)
if unseen_rows.any():
    raise ValueError(
        "x_test.csv has {} row(s) whose userid/taskid never appears in the "
        "training data; the model has no embedding for them".format(
            int(unseen_rows.sum())
        )
    )

x_test = df_x_test[["user", "task"]].values.astype(np.int64)

y_pred = model.predict(x_test)

# Undo the [0, 1] target normalization to get back to the rating scale. The
# arithmetic is done in float64, one vectorised expression for all rows.
df_y_pred = pd.DataFrame(y_pred, columns=["rating"])
df_y_pred = (
    df_y_pred["rating"].astype(np.float64) * (MAX_RATING - MIN_RATING) + MIN_RATING
)
df_y_pred.to_csv("y_pred.csv", sep=",", columns=["rating"], index=False)
116 +
# evaluate: root-mean-squared error between y_test.csv and y_pred.csv
import os
import sys
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error

gt = pd.read_csv("y_test.csv", header=0)
pr = pd.read_csv("y_pred.csv", header=0)

# Flatten both files to 1-D float arrays so they are compared value by value.
gt = gt.to_numpy().astype(float).reshape(-1)
pr = pr.to_numpy().astype(float).reshape(-1)

# Take the square root explicitly: the `squared=False` keyword is deprecated
# since scikit-learn 1.4 and removed in 1.6, while this form works on every
# version.
score = np.sqrt(mean_squared_error(gt, pr))
print("score:", score)
131 +
This diff is collapsed. Click to expand it.
1 +userid,contentsid,rating
2 +1,T000043,5
3 +1,T000055,0.5
4 +1,T000072,0.5
5 +1,T000064,5
6 +1,T001630,0.5
7 +1,T000308,0.5
8 +1,T000293,0.5
9 +1,T001616,0.5
10 +1,T001613,5
11 +1,T001601,0.5
12 +1,T001919,0.5
13 +1,T001946,5
14 +2,T000046,5
15 +2,T000051,0.5
16 +2,T000074,0.5
17 +2,T000308,0.5
18 +2,T000307,5
19 +2,T000299,0.5
20 +2,T000291,0.5
21 +2,T001613,5
22 +2,T001607,0.5
23 +2,T001920,5
24 +2,T001916,0.5
25 +2,T001943,0.5
26 +3,T000036,5
27 +3,T000049,0.5
28 +3,T000053,0.5
29 +3,T000061,5
30 +3,T000073,0.5
31 +3,T001628,5
32 +3,T000302,5
33 +3,T000212,0.5
34 +3,T001616,0.5
35 +3,T001606,0.5
36 +3,T001920,0.5
37 +3,T001915,0.5
38 +4,T001947,0.5
39 +4,T001921,0.5
40 +4,T001617,0.5
41 +4,T001606,0.5
42 +4,T000040,0.5
43 +4,T000045,0.5
44 +4,T000060,5
45 +4,T000077,0.5
46 +4,T000068,0.5
47 +4,T000302,0.5
48 +4,T000293,0.5
49 +4,T000288,0.5
50 +5,T001956,5
51 +5,T001915,0.5
52 +5,T001611,5
53 +5,T001604,5
54 +5,T000046,0.5
55 +5,T000056,5
56 +5,T000073,0.5
57 +5,T000065,0.5
58 +5,T001630,0.5
59 +5,T000309,0.5
60 +5,T000299,5
61 +5,T000294,0.5
62 +6,T001943,5
63 +6,T001911,0.5
64 +6,T000036,0.5
65 +6,T000050,5
66 +6,T000056,0.5
67 +6,T000059,5
68 +6,T000074,5
69 +6,T000072,0.5
70 +6,T000071,0.5
71 +6,T000293,0.5
72 +6,T000292,5
73 +6,T000212,0.5
74 +7,T000053,5
75 +7,T000054,5
76 +7,T000060,0.5
77 +7,T000078,0.5
78 +7,T000071,0.5
79 +7,T000298,0.5
80 +7,T000288,0.5
81 +7,T001608,0.5
82 +7,T001606,5
83 +7,T001917,0.5
84 +7,T001915,0.5
85 +7,T001914,5
86 +8,T000040,0.5
87 +8,T000044,0.5
88 +8,T000053,0.5
89 +8,T000059,5
90 +8,T000061,5
91 +8,T000072,0.5
92 +8,T001631,0.5
93 +8,T000301,0.5
94 +8,T000295,5
95 +8,T000294,5
96 +8,T001616,5
97 +8,T001944,5
98 +9,T000049,5
99 +9,T000051,0.5
100 +9,T000054,5
101 +9,T000055,5
102 +9,T000056,5
103 +9,T000311,5
104 +9,T000309,5
105 +9,T000297,5
106 +9,T000289,5
107 +9,T001614,0.5
108 +9,T001612,0.5
109 +9,T001956,0.5
110 +10,T000045,0.5
111 +10,T000053,0.5
112 +10,T000061,5
113 +10,T000069,0.5
114 +10,T000068,0.5
115 +10,T000312,0.5
116 +10,T000303,0.5
117 +10,T000297,0.5
118 +10,T000287,5
119 +10,T001615,5
120 +10,T001609,5
121 +10,T001915,5
122 +11,T000036,5
123 +11,T000046,0.5
124 +11,T000054,5
125 +11,T000077,0.5
126 +11,T000069,0.5
127 +11,T001628,5
128 +11,T000308,5
129 +11,T000301,0.5
130 +11,T000288,5
131 +11,T001610,0.5
132 +11,T001600,5
133 +11,T001914,5
134 +12,T000045,0.5
135 +12,T000048,0.5
136 +12,T000056,5
137 +12,T000075,5
138 +12,T000063,0.5
139 +12,T001631,5
140 +12,T000309,0.5
141 +12,T000302,0.5
142 +12,T000291,5
143 +12,T001616,0.5
144 +12,T001918,5
145 +12,T001947,5
146 +13,T000043,5
147 +13,T000052,5
148 +13,T000059,5
149 +13,T000071,5
150 +13,T001628,0.5
151 +13,T000307,5
152 +13,T000299,0.5
153 +13,T000291,5
154 +13,T001614,0.5
155 +13,T001604,5
156 +13,T001921,0.5
157 +13,T001911,0.5