refactoring aggregation functions
close Java-Cesco/Detecting_fraud_clicks#3
Showing
6 changed files
with
16 additions
and
166 deletions
data/train.csv
deleted
100644 → 0
This diff could not be displayed because it is too large.
data/train_.csv
deleted
100644 → 0
| 1 | -ip,app,device,os,channel,click_time,attributed_time,is_attributed | ||
| 2 | -117898,12,1,13,497,2017-11-07 09:30:38,,0 | ||
| 3 | -117898,12,1,13,497,2017-11-07 09:30:38,,0 | ||
| 4 | -117898,12,1,13,497,2017-11-07 09:31:38,,0 | ||
| 5 | -117898,12,1,13,497,2017-11-07 09:31:38,,0 | ||
| 6 | -117898,12,1,13,497,2017-11-07 09:31:38,,0 | ||
| 7 | -117898,12,1,13,497,2017-11-07 09:39:38,,0 | ||
| 8 | -117898,12,1,13,497,2017-11-07 09:40:38,,0 | ||
| ... | \ No newline at end of file | ... | \ No newline at end of file |
| ... | @@ -19,25 +19,26 @@ public class Aggregation { | ... | @@ -19,25 +19,26 @@ public class Aggregation { |
| 19 | .master("local") | 19 | .master("local") |
| 20 | .getOrCreate(); | 20 | .getOrCreate(); |
| 21 | 21 | ||
| 22 | + // Aggregation | ||
| 22 | Aggregation agg = new Aggregation(); | 23 | Aggregation agg = new Aggregation(); |
| 23 | 24 | ||
| 24 | Dataset<Row> dataset = agg.loadCSVDataSet("./train_sample.csv", spark); | 25 | Dataset<Row> dataset = agg.loadCSVDataSet("./train_sample.csv", spark); |
| 25 | dataset = agg.changeTimestempToLong(dataset); | 26 | dataset = agg.changeTimestempToLong(dataset); |
| 26 | dataset = agg.averageValidClickCount(dataset); | 27 | dataset = agg.averageValidClickCount(dataset); |
| 27 | dataset = agg.clickTimeDelta(dataset); | 28 | dataset = agg.clickTimeDelta(dataset); |
| 28 | - | 29 | + dataset = agg.countClickInTenMinutes(dataset); |
| 29 | - dataset.where("ip == '5348' and app == '19'").show(); | ||
| 30 | 30 | ||
| 31 | + //test | ||
| 32 | + dataset.where("ip == '5348' and app == '19'").show(10); | ||
| 31 | } | 33 | } |
| 32 | 34 | ||
| 33 | 35 | ||
| 34 | private Dataset<Row> loadCSVDataSet(String path, SparkSession spark){ | 36 | private Dataset<Row> loadCSVDataSet(String path, SparkSession spark){ |
| 35 | // Read CSV to DataSet | 37 | // Read CSV to DataSet |
| 36 | - Dataset<Row> dataset = spark.read().format("csv") | 38 | + return spark.read().format("csv") |
| 37 | .option("inferSchema", "true") | 39 | .option("inferSchema", "true") |
| 38 | .option("header", "true") | 40 | .option("header", "true") |
| 39 | - .load("train_sample.csv"); | 41 | + .load(path); |
| 40 | - return dataset; | ||
| 41 | } | 42 | } |
| 42 | 43 | ||
| 43 | private Dataset<Row> changeTimestempToLong(Dataset<Row> dataset){ | 44 | private Dataset<Row> changeTimestempToLong(Dataset<Row> dataset){ |
| ... | @@ -73,4 +74,14 @@ public class Aggregation { | ... | @@ -73,4 +74,14 @@ public class Aggregation { |
| 73 | newDF = newDF.drop("lag(utc_click_time)"); | 74 | newDF = newDF.drop("lag(utc_click_time)"); |
| 74 | return newDF; | 75 | return newDF; |
| 75 | } | 76 | } |
| 77 | + | ||
| 78 | + private Dataset<Row> countClickInTenMinutes(Dataset<Row> dataset){ | ||
| 79 | + WindowSpec w = Window.partitionBy("ip") | ||
| 80 | + .orderBy("utc_click_time") | ||
| 81 | + .rangeBetween(Window.currentRow(),Window.currentRow()+600); | ||
| 82 | + | ||
| 83 | + Dataset<Row> newDF = dataset.withColumn("count_click_in_ten_mins", | ||
| 84 | + (count("utc_click_time").over(w)).minus(1)); // TODO: decide whether the count should include the current click itself. | ||
| 85 | + return newDF; | ||
| 86 | + } | ||
| 76 | } | 87 | } | ... | ... |
src/main/java/AvgAdvTime.java
deleted
100644 → 0
| 1 | -import org.apache.spark.sql.Dataset; | ||
| 2 | -import org.apache.spark.sql.Row; | ||
| 3 | -import org.apache.spark.sql.SparkSession; | ||
| 4 | -import org.apache.spark.sql.expressions.Window; | ||
| 5 | -import org.apache.spark.sql.expressions.WindowSpec; | ||
| 6 | - | ||
| 7 | -import static org.apache.spark.sql.functions.col; | ||
| 8 | -import static org.apache.spark.sql.functions.count; | ||
| 9 | -import static org.apache.spark.sql.functions.sum; | ||
| 10 | - | ||
| 11 | - | ||
| 12 | -public class AvgAdvTime { | ||
| 13 | - | ||
| 14 | - public static void main(String[] args) throws Exception { | ||
| 15 | - | ||
| 16 | - // Start Spark Session | ||
| 17 | - SparkSession spark = SparkSession | ||
| 18 | - .builder() | ||
| 19 | - .master("local") | ||
| 20 | - .appName("Java Spark SQL basic example") | ||
| 21 | - .getOrCreate(); | ||
| 22 | - | ||
| 23 | - // Read SCV to DataSet | ||
| 24 | - Dataset<Row> df = spark.read().format("csv") | ||
| 25 | - .option("inferSchema", "true") | ||
| 26 | - .option("header", "true") | ||
| 27 | - .load("train_sample.csv"); | ||
| 28 | - | ||
| 29 | - // cast timestamp to long | ||
| 30 | - Dataset<Row> newdf = df.withColumn("utc_click_time", df.col("click_time").cast("long")); | ||
| 31 | - newdf = newdf.withColumn("utc_attributed_time", df.col("attributed_time").cast("long")); | ||
| 32 | - newdf = newdf.drop("click_time").drop("attributed_time"); | ||
| 33 | - | ||
| 34 | - // set Window partition by 'ip' and 'app' order by 'utc_click_time' select rows between 1st row to current row | ||
| 35 | - WindowSpec w = Window.partitionBy("ip", "app") | ||
| 36 | - .orderBy("utc_click_time") | ||
| 37 | - .rowsBetween(Window.unboundedPreceding(), Window.currentRow()); | ||
| 38 | - | ||
| 39 | - // aggregation | ||
| 40 | - newdf = newdf.withColumn("cum_count_click", count("utc_click_time").over(w)); | ||
| 41 | - newdf = newdf.withColumn("cum_sum_attributed", sum("is_attributed").over(w)); | ||
| 42 | - newdf = newdf.withColumn("avg_efficient", col("cum_sum_attributed").divide(col("cum_count_click"))); | ||
| 43 | - | ||
| 44 | - // print example | ||
| 45 | - newdf.where("ip == '5348' and app == '19'").show(); | ||
| 46 | - newdf.printSchema(); | ||
| 47 | - | ||
| 48 | - } | ||
| 49 | -} | ||
| ... | \ No newline at end of file | ... | \ No newline at end of file |
src/main/java/CountTen.java
deleted
100644 → 0
| 1 | -import org.apache.spark.sql.Column; | ||
| 2 | -import org.apache.spark.sql.Dataset; | ||
| 3 | -import org.apache.spark.sql.Row; | ||
| 4 | -import org.apache.spark.sql.SparkSession; | ||
| 5 | -import org.apache.spark.sql.expressions.Window; | ||
| 6 | -import org.apache.spark.sql.expressions.WindowSpec; | ||
| 7 | - | ||
| 8 | -import static org.apache.spark.sql.functions.*; | ||
| 9 | - | ||
| 10 | - | ||
| 11 | -public class CountTen { | ||
| 12 | - | ||
| 13 | - public static void main(String[] args) throws Exception { | ||
| 14 | - SparkSession spark = SparkSession | ||
| 15 | - .builder() | ||
| 16 | - .master("local") | ||
| 17 | - .appName("Java Spark SQL basic example") | ||
| 18 | - .getOrCreate(); | ||
| 19 | - | ||
| 20 | - Dataset<Row> df = spark.read().format("csv") | ||
| 21 | - .option("inferSchema", "true") | ||
| 22 | - .option("header", "true") | ||
| 23 | - .load("./data/train_.csv"); | ||
| 24 | - | ||
| 25 | - // cast timestamp to long | ||
| 26 | - Dataset<Row> newdf = df.withColumn("utc_click_time", df.col("click_time").cast("long")); | ||
| 27 | - newdf = newdf.withColumn("utc_attributed_time", df.col("attributed_time").cast("long")); | ||
| 28 | - newdf = newdf.drop("click_time").drop("attributed_time"); | ||
| 29 | - | ||
| 30 | - WindowSpec w = Window.partitionBy("ip") | ||
| 31 | - .orderBy("utc_click_time") | ||
| 32 | - .rangeBetween(Window.currentRow(),Window.currentRow()+600); | ||
| 33 | -// .rowsBetween(Window.currentRow(), Window.unboundedPreceding()); //Boundary end is not a valid integer: -9223372036854775808 | ||
| 34 | - | ||
| 35 | - newdf = newdf.withColumn("is_clicked_in_ten_mins", | ||
| 36 | - (count("utc_click_time").over(w)).minus(1)); //본인것 포함할 것인지 정해야함. | ||
| 37 | -// newdf = newdf.withColumn("is_clicked_in_ten_mins", | ||
| 38 | -// (lead(col("utc_click_time"),1).over(w).minus(col("utc_click_time")).lt((long)600)).cast("long")); | ||
| 39 | - | ||
| 40 | - newdf.where("ip == '117898'").show(false); | ||
| 41 | - } | ||
| 42 | -} | ||
| ... | \ No newline at end of file | ... | \ No newline at end of file |
| 1 | -import org.apache.spark.SparkConf; | ||
| 2 | -import org.apache.spark.api.java.JavaSparkContext; | ||
| 3 | -import org.apache.spark.sql.Dataset; | ||
| 4 | -import org.apache.spark.sql.Row; | ||
| 5 | -import org.apache.spark.sql.SparkSession; | ||
| 6 | -import org.apache.spark.sql.expressions.Window; | ||
| 7 | -import org.apache.spark.sql.expressions.WindowSpec; | ||
| 8 | - | ||
| 9 | -import javax.xml.crypto.Data; | ||
| 10 | - | ||
| 11 | -import static org.apache.spark.sql.functions.*; | ||
| 12 | - | ||
| 13 | -public class calForwardTimeDelta { | ||
| 14 | - static SparkConf conf = new SparkConf().setMaster("local[*]").setAppName("Cesco"); | ||
| 15 | - static JavaSparkContext sc = new JavaSparkContext(conf); | ||
| 16 | - | ||
| 17 | - public static void main(String[] args) throws Exception{ | ||
| 18 | - //Create Session | ||
| 19 | - SparkSession spark = SparkSession | ||
| 20 | - .builder() | ||
| 21 | - .appName("Detecting Fraud Clicks") | ||
| 22 | - .getOrCreate(); | ||
| 23 | - | ||
| 24 | - //run methods hereu | ||
| 25 | - calcDelta(spark); | ||
| 26 | - } | ||
| 27 | - | ||
| 28 | - private static void calcDelta(SparkSession spark){ | ||
| 29 | - // put the path the file you gonna deal with being placed | ||
| 30 | - String filepath = "train_sample.csv"; | ||
| 31 | - | ||
| 32 | - // create Dataset from files | ||
| 33 | - Dataset<Row> logDF = spark.read() | ||
| 34 | - .format("csv") | ||
| 35 | - .option("inferSchema", "true") | ||
| 36 | - .option("header","true") | ||
| 37 | - .load(filepath); | ||
| 38 | - | ||
| 39 | - // cast timestamp(click_time, attributed_time) type to long type | ||
| 40 | - | ||
| 41 | - //add column for long(click_time) | ||
| 42 | - Dataset<Row> newDF = logDF.withColumn("utc_click_time", logDF.col("click_time").cast("long")); | ||
| 43 | - //add column for long(attributed_time) | ||
| 44 | - newDF = newDF.withColumn("utc_attributed_time", logDF.col("attributed_time").cast("long")); | ||
| 45 | - //drop timestamp type columns | ||
| 46 | - newDF = newDF.drop("click_time").drop("attributed_time"); | ||
| 47 | - newDF.createOrReplaceTempView("logs"); | ||
| 48 | - | ||
| 49 | - WindowSpec w = Window.partitionBy ("ip") | ||
| 50 | - .orderBy("utc_click_time"); | ||
| 51 | - | ||
| 52 | - newDF = newDF.withColumn("lag(utc_click_time)", lag("utc_click_time",1).over(w)); | ||
| 53 | - newDF.where("ip=10").show(); | ||
| 54 | - newDF = newDF.withColumn("delta", when(col("lag(utc_click_time)").isNull(),lit(0)).otherwise(col("utc_click_time")).minus(when(col("lag(utc_click_time)").isNull(),lit(0)).otherwise(col("lag(utc_click_time)")))); | ||
| 55 | - //newDF = newDF.withColumn("delta", datediff()); | ||
| 56 | - newDF = newDF.drop("lag(utc_click_time)"); | ||
| 57 | - newDF = newDF.orderBy("ip"); | ||
| 58 | - | ||
| 59 | - newDF.show(); | ||
| 60 | - } | ||
| 61 | - | ||
| 62 | -} |
-
Please register or login to post a comment