신은섭(Shin Eun Seop)

Add average ad efficiency field

closed Java-Cesco/Detecting_fraud_clicks#3
......@@ -12,6 +12,8 @@
</content>
<orderEntry type="inheritedJdk" />
<orderEntry type="sourceFolder" forTests="false" />
<orderEntry type="library" name="Maven: com.databricks:spark-csv_2.11:1.5.0" level="project" />
<orderEntry type="library" name="Maven: org.apache.commons:commons-csv:1.1" level="project" />
<orderEntry type="library" name="Maven: org.apache.spark:spark-core_2.11:2.3.0" level="project" />
<orderEntry type="library" name="Maven: org.apache.avro:avro:1.7.7" level="project" />
<orderEntry type="library" name="Maven: org.codehaus.jackson:jackson-core-asl:1.9.13" level="project" />
......
......@@ -3,6 +3,13 @@
<component name="JavaScriptSettings">
<option name="languageLevel" value="ES6" />
</component>
<component name="MavenProjectsManager">
<option name="originalFiles">
<list>
<option value="$PROJECT_DIR$/pom.xml" />
</list>
</option>
</component>
<component name="ProjectRootManager" version="2" languageLevel="JDK_1_8" default="false" project-jdk-name="1.8" project-jdk-type="JavaSDK">
<output url="file:///tmp" />
</component>
......
import org.apache.commons.net.ntp.TimeStamp;
import org.apache.spark.Aggregator;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.*;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.types.IntegerType;
import org.apache.spark.sql.types.LongType;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.expressions.Window;
import org.apache.spark.sql.expressions.WindowSpec;
import java.io.Serializable;
import java.sql.Time;
import java.sql.Timestamp;
import static org.apache.spark.sql.functions.unix_timestamp;
import static org.apache.spark.sql.functions.col;
import static org.apache.spark.sql.functions.count;
import static org.apache.spark.sql.functions.sum;
public class AvgAdvTime {
......@@ -29,11 +23,21 @@ public class AvgAdvTime {
.option("inferSchema", "true")
.option("header", "true")
.load("train_sample.csv");
// Print the schema of the loaded CSV (read with inferSchema and header enabled).
df.printSchema();
// Cast the timestamp columns to long and store them under new "utc_*" names.
// NOTE(review): presumably the long value is epoch seconds — confirm against the inferred column type.
Dataset<Row> newdf = df.withColumn("utc_click_time", df.col("click_time").cast("long"));
newdf = newdf.withColumn("utc_attributed_time", df.col("attributed_time").cast("long"));
newdf.show();
// Drop the original columns now that the long-typed copies have been added.
newdf = newdf.drop("click_time").drop("attributed_time");
// Running window: rows are grouped by (ip, app), ordered by click time, and each
// row's frame spans from the start of its partition up to and including itself.
WindowSpec w = Window.partitionBy("ip", "app")
.orderBy("utc_click_time")
.rowsBetween(Window.unboundedPreceding(), Window.currentRow());
// Cumulative count of utc_click_time values within the window frame.
newdf = newdf.withColumn("cum_count_click", count("utc_click_time").over(w));
// Cumulative sum of is_attributed within the same window frame.
newdf = newdf.withColumn("cum_sum_attributed", sum("is_attributed").over(w));
// avg_efficient = cum_sum_attributed / cum_count_click for each row.
newdf = newdf.withColumn("avg_efficient", col("cum_sum_attributed").divide(col("cum_count_click")));
// Show the rows for a single hard-coded (ip, app) pair.
// NOTE(review): looks like a manual spot check of the new columns — confirm it is meant to stay.
newdf.where("ip == '5348' and app == '19'").show();
// Print the schema again, now including the added columns.
newdf.printSchema();
}
}
\ No newline at end of file
......