신은섭(Shin Eun Seop)

refactoring aggregation functions

close Java-Cesco/Detecting_fraud_clicks#3
ip,app,device,os,channel,click_time,attributed_time,is_attributed
117898,12,1,13,497,2017-11-07 09:30:38,,0
117898,12,1,13,497,2017-11-07 09:30:38,,0
117898,12,1,13,497,2017-11-07 09:31:38,,0
117898,12,1,13,497,2017-11-07 09:31:38,,0
117898,12,1,13,497,2017-11-07 09:31:38,,0
117898,12,1,13,497,2017-11-07 09:39:38,,0
117898,12,1,13,497,2017-11-07 09:40:38,,0
Aggregation.java
@@ -19,25 +19,26 @@ public class Aggregation {
                 .master("local")
                 .getOrCreate();
 
         // Aggregation
         Aggregation agg = new Aggregation();
         Dataset<Row> dataset = agg.loadCSVDataSet("./train_sample.csv", spark);
         dataset = agg.changeTimestempToLong(dataset);
         dataset = agg.averageValidClickCount(dataset);
         dataset = agg.clickTimeDelta(dataset);
-        dataset.where("ip == '5348' and app == '19'").show();
+        dataset = agg.countClickInTenMinutes(dataset);
+
+        //test
+        dataset.where("ip == '5348' and app == '19'").show(10);
     }
 
     private Dataset<Row> loadCSVDataSet(String path, SparkSession spark){
         // Read CSV to Dataset
-        Dataset<Row> dataset = spark.read().format("csv")
+        return spark.read().format("csv")
                 .option("inferSchema", "true")
                 .option("header", "true")
-                .load("train_sample.csv");
-        return dataset;
+                .load(path);
     }
 
     private Dataset<Row> changeTimestempToLong(Dataset<Row> dataset){
@@ -73,4 +74,14 @@ public class Aggregation {
         newDF = newDF.drop("lag(utc_click_time)");
         return newDF;
     }
+
+    private Dataset<Row> countClickInTenMinutes(Dataset<Row> dataset){
+        WindowSpec w = Window.partitionBy("ip")
+                .orderBy("utc_click_time")
+                .rangeBetween(Window.currentRow(), Window.currentRow() + 600);
+
+        Dataset<Row> newDF = dataset.withColumn("count_click_in_ten_mins",
+                (count("utc_click_time").over(w)).minus(1)); //TODO decide whether to include the current click itself.
+        return newDF;
+    }
 }
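The refactored loadCSVDataSet no longer hard-codes the file name, so the call site in main can switch data sets by changing a single argument. A minimal sketch of the intended usage (the second path is an assumption, not part of this commit):

    // inside Aggregation.main(): load either the bundled sample or a full data set
    Dataset<Row> dataset = agg.loadCSVDataSet("./train_sample.csv", spark);
    // Dataset<Row> dataset = agg.loadCSVDataSet("./data/train.csv", spark); // hypothetical path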
AvgAdvTime.java
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.expressions.Window;
import org.apache.spark.sql.expressions.WindowSpec;
import static org.apache.spark.sql.functions.col;
import static org.apache.spark.sql.functions.count;
import static org.apache.spark.sql.functions.sum;
public class AvgAdvTime {
public static void main(String[] args) throws Exception {
// Start Spark Session
SparkSession spark = SparkSession
.builder()
.master("local")
.appName("Java Spark SQL basic example")
.getOrCreate();
// Read CSV to Dataset
Dataset<Row> df = spark.read().format("csv")
.option("inferSchema", "true")
.option("header", "true")
.load("train_sample.csv");
// cast timestamp to long
Dataset<Row> newdf = df.withColumn("utc_click_time", df.col("click_time").cast("long"));
newdf = newdf.withColumn("utc_attributed_time", df.col("attributed_time").cast("long"));
newdf = newdf.drop("click_time").drop("attributed_time");
// Window: partition by 'ip' and 'app', order by 'utc_click_time', rows from the first row up to the current row
WindowSpec w = Window.partitionBy("ip", "app")
.orderBy("utc_click_time")
.rowsBetween(Window.unboundedPreceding(), Window.currentRow());
// aggregation
newdf = newdf.withColumn("cum_count_click", count("utc_click_time").over(w));
newdf = newdf.withColumn("cum_sum_attributed", sum("is_attributed").over(w));
newdf = newdf.withColumn("avg_efficient", col("cum_sum_attributed").divide(col("cum_count_click")));
// print example
newdf.where("ip == '5348' and app == '19'").show();
newdf.printSchema();
}
}
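To make the three window columns concrete, here is a hand-computed illustration for a single (ip, app) partition with made-up is_attributed values (0, 0, 1, 0), assuming the rows are ordered by utc_click_time:

    // is_attributed:      0, 0, 1,      0
    // cum_count_click:    1, 2, 3,      4
    // cum_sum_attributed: 0, 0, 1,      1
    // avg_efficient:      0, 0, 0.3333, 0.25   (cum_sum_attributed / cum_count_click)

So avg_efficient is the running conversion rate: the fraction of this ip/app pair's clicks so far that were attributed.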
CountTen.java
import org.apache.spark.sql.Column;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.expressions.Window;
import org.apache.spark.sql.expressions.WindowSpec;
import static org.apache.spark.sql.functions.*;
public class CountTen {
public static void main(String[] args) throws Exception {
SparkSession spark = SparkSession
.builder()
.master("local")
.appName("Java Spark SQL basic example")
.getOrCreate();
Dataset<Row> df = spark.read().format("csv")
.option("inferSchema", "true")
.option("header", "true")
.load("./data/train_.csv");
// cast timestamp to long
Dataset<Row> newdf = df.withColumn("utc_click_time", df.col("click_time").cast("long"));
newdf = newdf.withColumn("utc_attributed_time", df.col("attributed_time").cast("long"));
newdf = newdf.drop("click_time").drop("attributed_time");
WindowSpec w = Window.partitionBy("ip")
.orderBy("utc_click_time")
.rangeBetween(Window.currentRow(),Window.currentRow()+600);
// .rowsBetween(Window.currentRow(), Window.unboundedPreceding()); //Boundary end is not a valid integer: -9223372036854775808
newdf = newdf.withColumn("is_clicked_in_ten_mins",
(count("utc_click_time").over(w)).minus(1)); //need to decide whether to include the current click itself
// newdf = newdf.withColumn("is_clicked_in_ten_mins",
// (lead(col("utc_click_time"),1).over(w).minus(col("utc_click_time")).lt((long)600)).cast("long"));
newdf.where("ip == '117898'").show(false);
}
}
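For reference, the expected is_clicked_in_ten_mins values for the sample rows above (ip 117898), hand-computed on the assumption that utc_click_time is click_time cast to epoch seconds, so the window [currentRow, currentRow + 600] spans the next ten minutes inclusive:

    // clicks at 09:30:38 (x2), 09:31:38 (x3), 09:39:38, 09:40:38
    // 09:30:38 -> 6   all seven clicks fall in [09:30:38, 09:40:38], minus the row itself
    // 09:31:38 -> 4   the three at 09:31:38 plus 09:39:38 and 09:40:38, minus the row itself
    // 09:39:38 -> 1   only 09:40:38 follows within ten minutes
    // 09:40:38 -> 0   no later clicks in range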
calForwardTimeDelta.java
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.expressions.Window;
import org.apache.spark.sql.expressions.WindowSpec;
import static org.apache.spark.sql.functions.*;
public class calForwardTimeDelta {
static SparkConf conf = new SparkConf().setMaster("local[*]").setAppName("Cesco");
static JavaSparkContext sc = new JavaSparkContext(conf);
public static void main(String[] args) throws Exception{
//Create Session
SparkSession spark = SparkSession
.builder()
.appName("Detecting Fraud Clicks")
.getOrCreate();
//run methods here
calcDelta(spark);
}
private static void calcDelta(SparkSession spark){
// path to the input file
String filepath = "train_sample.csv";
// create Dataset from files
Dataset<Row> logDF = spark.read()
.format("csv")
.option("inferSchema", "true")
.option("header","true")
.load(filepath);
// cast timestamp(click_time, attributed_time) type to long type
//add column for long(click_time)
Dataset<Row> newDF = logDF.withColumn("utc_click_time", logDF.col("click_time").cast("long"));
//add column for long(attributed_time)
newDF = newDF.withColumn("utc_attributed_time", logDF.col("attributed_time").cast("long"));
//drop timestamp type columns
newDF = newDF.drop("click_time").drop("attributed_time");
newDF.createOrReplaceTempView("logs");
WindowSpec w = Window.partitionBy("ip")
.orderBy("utc_click_time");
newDF = newDF.withColumn("lag(utc_click_time)", lag("utc_click_time",1).over(w));
newDF.where("ip=10").show();
newDF = newDF.withColumn("delta", when(col("lag(utc_click_time)").isNull(),lit(0)).otherwise(col("utc_click_time")).minus(when(col("lag(utc_click_time)").isNull(),lit(0)).otherwise(col("lag(utc_click_time)"))));
//newDF = newDF.withColumn("delta", datediff());
newDF = newDF.drop("lag(utc_click_time)");
newDF = newDF.orderBy("ip");
newDF.show();
}
}
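A side note on the delta column: both when(...) guards test the same null condition, so the expression can be written more compactly. An equivalent sketch (not part of this commit) that still assigns delta = 0 to the first click of each ip:

    newDF = newDF.withColumn("delta",
            when(col("lag(utc_click_time)").isNull(), lit(0))
                    .otherwise(col("utc_click_time").minus(col("lag(utc_click_time)"))));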