Authored by 신은섭(Shin Eun Seop) on 2018-05-25 19:33:01 +0900
Commit dd04a0b37a1e763f0c35641e64a0391e1dee74e0 (dd04a0b3), 1 parent 68f248cd

add average ad efficient field
closed Java-Cesco/Detecting_fraud_clicks#3
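In short, this commit makes AvgAdvTime.java compute, for every (ip, app) pair, a running click count (cum_count_click) and a running sum of is_attributed (cum_sum_attributed) over a window ordered by click time, and store their ratio as the new avg_efficient column; the two .idea files are IDE metadata updated alongside. A toy walk-through of the window logic follows the diff below.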
Showing 3 changed files with 28 additions and 15 deletions:
.idea/Detecting_fraud_clicks.iml
.idea/misc.xml
src/main/java/AvgAdvTime.java
.idea/Detecting_fraud_clicks.iml
@@ -12,6 +12,8 @@
    </content>
    <orderEntry type="inheritedJdk" />
    <orderEntry type="sourceFolder" forTests="false" />
    <orderEntry type="library" name="Maven: com.databricks:spark-csv_2.11:1.5.0" level="project" />
    <orderEntry type="library" name="Maven: org.apache.commons:commons-csv:1.1" level="project" />
    <orderEntry type="library" name="Maven: org.apache.spark:spark-core_2.11:2.3.0" level="project" />
    <orderEntry type="library" name="Maven: org.apache.avro:avro:1.7.7" level="project" />
    <orderEntry type="library" name="Maven: org.codehaus.jackson:jackson-core-asl:1.9.13" level="project" />
...
.idea/misc.xml
@@ -3,6 +3,13 @@
  <component name="JavaScriptSettings">
    <option name="languageLevel" value="ES6" />
  </component>
  <component name="MavenProjectsManager">
    <option name="originalFiles">
      <list>
        <option value="$PROJECT_DIR$/pom.xml" />
      </list>
    </option>
  </component>
  <component name="ProjectRootManager" version="2" languageLevel="JDK_1_8" default="false" project-jdk-name="1.8" project-jdk-type="JavaSDK">
    <output url="file:///tmp" />
  </component>
...
src/main/java/AvgAdvTime.java

import org.apache.commons.net.ntp.TimeStamp;
import org.apache.spark.Aggregator;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.*;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.types.IntegerType;
import org.apache.spark.sql.types.LongType;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.expressions.Window;
import org.apache.spark.sql.expressions.WindowSpec;

import java.io.Serializable;
import java.sql.Time;
import java.sql.Timestamp;

import static org.apache.spark.sql.functions.unix_timestamp;
import static org.apache.spark.sql.functions.col;
import static org.apache.spark.sql.functions.count;
import static org.apache.spark.sql.functions.sum;

public class AvgAdvTime {
...
@@ -29,11 +23,21 @@ public class AvgAdvTime {
                .option("inferSchema", "true")
                .option("header", "true")
                .load("train_sample.csv");
        df.printSchema();

        // cast timestamp to long
        Dataset<Row> newdf = df.withColumn("utc_click_time", df.col("click_time").cast("long"));
        newdf = newdf.withColumn("utc_attributed_time", df.col("attributed_time").cast("long"));
        newdf.show();
        newdf = newdf.drop("click_time").drop("attributed_time");

        WindowSpec w = Window.partitionBy("ip", "app")
                .orderBy("utc_click_time")
                .rowsBetween(Window.unboundedPreceding(), Window.currentRow());

        newdf = newdf.withColumn("cum_count_click", count("utc_click_time").over(w));
        newdf = newdf.withColumn("cum_sum_attributed", sum("is_attributed").over(w));
        newdf = newdf.withColumn("avg_efficient", col("cum_sum_attributed").divide(col("cum_count_click")));

        newdf.where("ip == '5348' and app == '19'").show();
        newdf.printSchema();
    }
}
\ No newline at end of file
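As a sanity check on the new column, here is a minimal, self-contained sketch of the same window logic; it is not part of the commit, and the toy rows, the class name AvgEfficientSketch, and the local[*] master setting are assumptions for illustration only. Three hypothetical clicks from one (ip, app) pair, of which only the second converted, should yield avg_efficient values 0.0, 0.5, and 1/3.

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.expressions.Window;
import org.apache.spark.sql.expressions.WindowSpec;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructType;

import java.util.Arrays;

import static org.apache.spark.sql.functions.col;
import static org.apache.spark.sql.functions.count;
import static org.apache.spark.sql.functions.sum;

public class AvgEfficientSketch {
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder()
                .master("local[*]")
                .appName("AvgEfficientSketch")
                .getOrCreate();

        // Columns mirror the ones produced in AvgAdvTime after the casts/drops.
        StructType schema = new StructType()
                .add("ip", DataTypes.StringType)
                .add("app", DataTypes.StringType)
                .add("utc_click_time", DataTypes.LongType)
                .add("is_attributed", DataTypes.IntegerType);

        // Hypothetical data: three clicks from one (ip, app); only the second converted.
        Dataset<Row> toy = spark.createDataFrame(Arrays.asList(
                RowFactory.create("5348", "19", 100L, 0),
                RowFactory.create("5348", "19", 200L, 1),
                RowFactory.create("5348", "19", 300L, 0)
        ), schema);

        // Same running window as the commit: every click up to and including the current one.
        WindowSpec w = Window.partitionBy("ip", "app")
                .orderBy("utc_click_time")
                .rowsBetween(Window.unboundedPreceding(), Window.currentRow());

        // avg_efficient per row: 0/1 = 0.0, 1/2 = 0.5, 1/3 ≈ 0.333
        toy.withColumn("cum_count_click", count("utc_click_time").over(w))
           .withColumn("cum_sum_attributed", sum("is_attributed").over(w))
           .withColumn("avg_efficient", col("cum_sum_attributed").divide(col("cum_count_click")))
           .show();

        spark.stop();
    }
}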