Add average ad efficiency field
closed Java-Cesco/Detecting_fraud_clicks#3
Showing 3 changed files with 28 additions and 15 deletions
| ... | @@ -12,6 +12,8 @@ | ... | @@ -12,6 +12,8 @@ |
| 12 | </content> | 12 | </content> |
| 13 | <orderEntry type="inheritedJdk" /> | 13 | <orderEntry type="inheritedJdk" /> |
| 14 | <orderEntry type="sourceFolder" forTests="false" /> | 14 | <orderEntry type="sourceFolder" forTests="false" /> |
| 15 | + <orderEntry type="library" name="Maven: com.databricks:spark-csv_2.11:1.5.0" level="project" /> | ||
| 16 | + <orderEntry type="library" name="Maven: org.apache.commons:commons-csv:1.1" level="project" /> | ||
| 15 | <orderEntry type="library" name="Maven: org.apache.spark:spark-core_2.11:2.3.0" level="project" /> | 17 | <orderEntry type="library" name="Maven: org.apache.spark:spark-core_2.11:2.3.0" level="project" /> |
| 16 | <orderEntry type="library" name="Maven: org.apache.avro:avro:1.7.7" level="project" /> | 18 | <orderEntry type="library" name="Maven: org.apache.avro:avro:1.7.7" level="project" /> |
| 17 | <orderEntry type="library" name="Maven: org.codehaus.jackson:jackson-core-asl:1.9.13" level="project" /> | 19 | <orderEntry type="library" name="Maven: org.codehaus.jackson:jackson-core-asl:1.9.13" level="project" /> | ... | ... |
| ... | @@ -3,6 +3,13 @@ | ... | @@ -3,6 +3,13 @@ |
| 3 | <component name="JavaScriptSettings"> | 3 | <component name="JavaScriptSettings"> |
| 4 | <option name="languageLevel" value="ES6" /> | 4 | <option name="languageLevel" value="ES6" /> |
| 5 | </component> | 5 | </component> |
| 6 | + <component name="MavenProjectsManager"> | ||
| 7 | + <option name="originalFiles"> | ||
| 8 | + <list> | ||
| 9 | + <option value="$PROJECT_DIR$/pom.xml" /> | ||
| 10 | + </list> | ||
| 11 | + </option> | ||
| 12 | + </component> | ||
| 6 | <component name="ProjectRootManager" version="2" languageLevel="JDK_1_8" default="false" project-jdk-name="1.8" project-jdk-type="JavaSDK"> | 13 | <component name="ProjectRootManager" version="2" languageLevel="JDK_1_8" default="false" project-jdk-name="1.8" project-jdk-type="JavaSDK"> |
| 7 | <output url="file:///tmp" /> | 14 | <output url="file:///tmp" /> |
| 8 | </component> | 15 | </component> | ... | ... |
| 1 | -import org.apache.commons.net.ntp.TimeStamp; | 1 | +import org.apache.spark.sql.Dataset; |
| 2 | -import org.apache.spark.Aggregator; | ||
| 3 | -import org.apache.spark.SparkConf; | ||
| 4 | -import org.apache.spark.api.java.JavaSparkContext; | ||
| 5 | -import org.apache.spark.api.java.function.MapFunction; | ||
| 6 | -import org.apache.spark.sql.*; | ||
| 7 | import org.apache.spark.sql.Row; | 2 | import org.apache.spark.sql.Row; |
| 8 | -import org.apache.spark.sql.types.IntegerType; | 3 | +import org.apache.spark.sql.SparkSession; |
| 9 | -import org.apache.spark.sql.types.LongType; | 4 | +import org.apache.spark.sql.expressions.Window; |
| 5 | +import org.apache.spark.sql.expressions.WindowSpec; | ||
| 10 | 6 | ||
| 11 | -import java.io.Serializable; | 7 | +import static org.apache.spark.sql.functions.col; |
| 12 | -import java.sql.Time; | 8 | +import static org.apache.spark.sql.functions.count; |
| 13 | -import java.sql.Timestamp; | 9 | +import static org.apache.spark.sql.functions.sum; |
| 14 | - | ||
| 15 | -import static org.apache.spark.sql.functions.unix_timestamp; | ||
| 16 | 10 | ||
| 17 | 11 | ||
| 18 | public class AvgAdvTime { | 12 | public class AvgAdvTime { |
| ... | @@ -29,11 +23,21 @@ public class AvgAdvTime { | ... | @@ -29,11 +23,21 @@ public class AvgAdvTime { |
| 29 | .option("inferSchema", "true") | 23 | .option("inferSchema", "true") |
| 30 | .option("header", "true") | 24 | .option("header", "true") |
| 31 | .load("train_sample.csv"); | 25 | .load("train_sample.csv"); |
| 32 | - df.printSchema(); | ||
| 33 | 26 | ||
| 34 | // cast timestamp to long | 27 | // cast timestamp to long |
| 35 | Dataset<Row> newdf = df.withColumn("utc_click_time", df.col("click_time").cast("long")); | 28 | Dataset<Row> newdf = df.withColumn("utc_click_time", df.col("click_time").cast("long")); |
| 36 | newdf = newdf.withColumn("utc_attributed_time", df.col("attributed_time").cast("long")); | 29 | newdf = newdf.withColumn("utc_attributed_time", df.col("attributed_time").cast("long")); |
| 37 | - newdf.show(); | 30 | + newdf = newdf.drop("click_time").drop("attributed_time"); |
| 31 | + | ||
| 32 | + WindowSpec w = Window.partitionBy("ip", "app") | ||
| 33 | + .orderBy("utc_click_time") | ||
| 34 | + .rowsBetween(Window.unboundedPreceding(), Window.currentRow()); | ||
| 35 | + | ||
| 36 | + newdf = newdf.withColumn("cum_count_click", count("utc_click_time").over(w)); | ||
| 37 | + newdf = newdf.withColumn("cum_sum_attributed", sum("is_attributed").over(w)); | ||
| 38 | + newdf = newdf.withColumn("avg_efficient", col("cum_sum_attributed").divide(col("cum_count_click"))); | ||
| 39 | + newdf.where("ip == '5348' and app == '19'").show(); | ||
| 40 | + newdf.printSchema(); | ||
| 41 | + | ||
| 38 | } | 42 | } |
| 39 | } | 43 | } |
| ... | \ No newline at end of file | ... | \ No newline at end of file | ... | ... |
-
Please register or login to post a comment