Showing
2 changed files
with
0 additions
and
110 deletions
src/main/java/Aggregation.java
deleted
100644 → 0
| 1 | -import org.apache.spark.api.java.function.MapFunction; | ||
| 2 | -import org.apache.spark.sql.Dataset; | ||
| 3 | -import org.apache.spark.sql.Encoders; | ||
| 4 | -import org.apache.spark.sql.Row; | ||
| 5 | -import org.apache.spark.sql.SparkSession; | ||
| 6 | -import org.apache.spark.sql.expressions.Window; | ||
| 7 | -import org.apache.spark.sql.expressions.WindowSpec; | ||
| 8 | - | ||
| 9 | -import java.util.ArrayList; | ||
| 10 | -import java.util.List; | ||
| 11 | - | ||
| 12 | -import static org.apache.spark.sql.functions.*; | ||
| 13 | -import static org.apache.spark.sql.functions.lit; | ||
| 14 | -import static org.apache.spark.sql.functions.when; | ||
| 15 | - | ||
| 16 | -public class Aggregation { | ||
| 17 | - | ||
| 18 | - public static void main(String[] args) throws Exception { | ||
| 19 | - | ||
| 20 | - //Create Session | ||
| 21 | - SparkSession spark = SparkSession | ||
| 22 | - .builder() | ||
| 23 | - .appName("Detecting Fraud Clicks") | ||
| 24 | - .master("local") | ||
| 25 | - .getOrCreate(); | ||
| 26 | - | ||
| 27 | - // Aggregation | ||
| 28 | - Aggregation agg = new Aggregation(); | ||
| 29 | - | ||
| 30 | - Dataset<Row> dataset = agg.loadCSVDataSet("/home/chris/.kaggle/competitions/talkingdata-adtracking-fraud-detection/mnt/ssd/kaggle-talkingdata2/competition_files/train_sample.csv", spark); | ||
| 31 | - dataset = agg.changeTimestempToLong(dataset); | ||
| 32 | - dataset = agg.averageValidClickCount(dataset); | ||
| 33 | - dataset = agg.clickTimeDelta(dataset); | ||
| 34 | - dataset = agg.countClickInTenMinutes(dataset); | ||
| 35 | - | ||
| 36 | - long start = System.currentTimeMillis(); | ||
| 37 | - | ||
| 38 | - List<String> logs_with_features = dataset.map(row->row.toString(), Encoders.STRING()).collectAsList(); | ||
| 39 | - String[][] contents = new String[(int)dataset.count()][11]; | ||
| 40 | - for (int i =0; i<logs_with_features.size();i++){ | ||
| 41 | - String str_to_split = logs_with_features.get(i); | ||
| 42 | - String[] tmp = str_to_split.substring(1,str_to_split.length()-1).split(","); | ||
| 43 | - contents[i] = tmp; | ||
| 44 | - } | ||
| 45 | - | ||
| 46 | - long end = System.currentTimeMillis(); | ||
| 47 | - System.out.println("JK's Procedure time elapsed : " + (end-start)/1000.0); | ||
| 48 | - | ||
| 49 | - start = System.currentTimeMillis(); | ||
| 50 | - List<String> stringDataset = dataset.toJSON().collectAsList(); | ||
| 51 | - end = System.currentTimeMillis(); | ||
| 52 | - System.out.println("Steve's Procedure 1 time elapsed : " + (end-start)/1000.0); | ||
| 53 | - new GUI(stringDataset, contents); | ||
| 54 | - | ||
| 55 | - | ||
| 56 | - } | ||
| 57 | - | ||
| 58 | - | ||
| 59 | - private Dataset<Row> loadCSVDataSet(String path, SparkSession spark){ | ||
| 60 | - // Read SCV to DataSet | ||
| 61 | - return spark.read().format("csv") | ||
| 62 | - .option("inferSchema", "true") | ||
| 63 | - .option("header", "true") | ||
| 64 | - .load(path); | ||
| 65 | - } | ||
| 66 | - | ||
| 67 | - private Dataset<Row> changeTimestempToLong(Dataset<Row> dataset){ | ||
| 68 | - // cast timestamp to long | ||
| 69 | - Dataset<Row> newDF = dataset.withColumn("utc_click_time", dataset.col("click_time").cast("long")); | ||
| 70 | - newDF = newDF.withColumn("utc_attributed_time", dataset.col("attributed_time").cast("long")); | ||
| 71 | - newDF = newDF.drop("click_time").drop("attributed_time"); | ||
| 72 | - return newDF; | ||
| 73 | - } | ||
| 74 | - | ||
| 75 | - private Dataset<Row> averageValidClickCount(Dataset<Row> dataset){ | ||
| 76 | - // set Window partition by 'ip' and 'app' order by 'utc_click_time' select rows between 1st row to current row | ||
| 77 | - WindowSpec w = Window.partitionBy("ip", "app") | ||
| 78 | - .orderBy("utc_click_time") | ||
| 79 | - .rowsBetween(Window.unboundedPreceding(), Window.currentRow()); | ||
| 80 | - | ||
| 81 | - // aggregation | ||
| 82 | - Dataset<Row> newDF = dataset.withColumn("cum_count_click", count("utc_click_time").over(w)); | ||
| 83 | - newDF = newDF.withColumn("cum_sum_attributed", sum("is_attributed").over(w)); | ||
| 84 | - newDF = newDF.withColumn("avg_valid_click_count", col("cum_sum_attributed").divide(col("cum_count_click"))); | ||
| 85 | - newDF = newDF.drop("cum_count_click", "cum_sum_attributed"); | ||
| 86 | - return newDF; | ||
| 87 | - } | ||
| 88 | - | ||
| 89 | - private Dataset<Row> clickTimeDelta(Dataset<Row> dataset){ | ||
| 90 | - WindowSpec w = Window.partitionBy ("ip") | ||
| 91 | - .orderBy("utc_click_time"); | ||
| 92 | - | ||
| 93 | - Dataset<Row> newDF = dataset.withColumn("lag(utc_click_time)", lag("utc_click_time",1).over(w)); | ||
| 94 | - newDF = newDF.withColumn("click_time_delta", when(col("lag(utc_click_time)").isNull(), | ||
| 95 | - lit(0)).otherwise(col("utc_click_time")).minus(when(col("lag(utc_click_time)").isNull(), | ||
| 96 | - lit(0)).otherwise(col("lag(utc_click_time)")))); | ||
| 97 | - newDF = newDF.drop("lag(utc_click_time)"); | ||
| 98 | - return newDF; | ||
| 99 | - } | ||
| 100 | - | ||
| 101 | - private Dataset<Row> countClickInTenMinutes(Dataset<Row> dataset){ | ||
| 102 | - WindowSpec w = Window.partitionBy("ip") | ||
| 103 | - .orderBy("utc_click_time") | ||
| 104 | - .rangeBetween(Window.currentRow(),Window.currentRow()+600); | ||
| 105 | - | ||
| 106 | - Dataset<Row> newDF = dataset.withColumn("count_click_in_ten_mins", | ||
| 107 | - (count("utc_click_time").over(w)).minus(1)); //TODO 본인것 포함할 것인지 정해야함. | ||
| 108 | - return newDF; | ||
| 109 | - } | ||
| 110 | -} | ||
| ... | \ No newline at end of file | ... | \ No newline at end of file |
This diff is collapsed. Click to expand it.
-
Please register or login to post a comment