Merge pull request #17 from Java-Cesco/feature/tenMinsHG

Feature/ten mins hg

Merge pull request #17 from Java-Cesco/feature/tenMinsHG
Feature/ten mins hg
신은섭(Shin Eun Seop) · GitHub
Commit a0aa94695f74ac72c776bd79cb5346dcc8a870f3 a0aa9469 2 parents 41a22842 27b69035
Showing 6 changed files with 58 additions and 6 deletions
.idea/.name
.idea/Detecting_fraud_clicks.iml
data/train.csv
data/train_.csv
pom.xml
src/main/java/CountTen.java
--- a/.idea/.name 0 → 100644
View file @a0aa946
+++ b/.idea/.name 0 → 100644
View file @a0aa946
+ Detecting_fraud_clicks
\ No newline at end of file
--- a/.idea/Detecting_fraud_clicks.iml
View file @a0aa946
+++ b/.idea/Detecting_fraud_clicks.iml
View file @a0aa946
@@ -172,4 +172,4 @@
     <orderEntry type="library" name="Maven: com.databricks:spark-csv_2.11:1.5.0" level="project" />
     <orderEntry type="library" name="Maven: org.apache.commons:commons-csv:1.1" level="project" />
   </component>
- </module>
\ No newline at end of file
+ </module>
--- a/data/train.csv 0 → 100644
View file @a0aa946
+++ b/data/train.csv 0 → 100644
View file @a0aa946
--- a/data/train_.csv 0 → 100644
View file @a0aa946
+++ b/data/train_.csv 0 → 100644
View file @a0aa946
+ ip,app,device,os,channel,click_time,attributed_time,is_attributed
+ 117898,12,1,13,497,2017-11-07 09:30:38,,0
+ 117898,12,1,13,497,2017-11-07 09:30:38,,0
+ 117898,12,1,13,497,2017-11-07 09:31:38,,0
+ 117898,12,1,13,497,2017-11-07 09:31:38,,0
+ 117898,12,1,13,497,2017-11-07 09:31:38,,0
+ 117898,12,1,13,497,2017-11-07 09:39:38,,0
+ 117898,12,1,13,497,2017-11-07 09:40:38,,0
\ No newline at end of file
--- a/pom.xml
View file @a0aa946
+++ b/pom.xml
View file @a0aa946
@@ -16,19 +16,21 @@
             <artifactId>spark-core_2.11</artifactId>
             <version>2.3.0</version>
         </dependency>
- 
+         <dependency>
+             <groupId>org.apache.spark</groupId>
+             <artifactId>spark-sql_2.11</artifactId>
+             <version>2.2.0</version>
+         </dependency>
         <dependency>
             <groupId>org.apache.spark</groupId>
             <artifactId>spark-sql_2.11</artifactId>
             <version>2.3.0</version>
         </dependency>
- 
         <dependency>
             <groupId>com.databricks</groupId>
             <artifactId>spark-csv_2.11</artifactId>
             <version>1.5.0</version>
         </dependency>
- 
     </dependencies>
 
     <build>
@@ -44,6 +46,5 @@
             </plugin>
         </plugins>
     </build>
- 
- 
+     
 </project>
\ No newline at end of file
--- a/src/main/java/CountTen.java 0 → 100644
View file @a0aa946
+++ b/src/main/java/CountTen.java 0 → 100644
View file @a0aa946
+ import org.apache.spark.sql.Column;
+ import org.apache.spark.sql.Dataset;
+ import org.apache.spark.sql.Row;
+ import org.apache.spark.sql.SparkSession;
+ import org.apache.spark.sql.expressions.Window;
+ import org.apache.spark.sql.expressions.WindowSpec;
+ 
+ import static org.apache.spark.sql.functions.*;
+ 
+ 
+ /**
+  * Small Spark SQL job that, for every click in the sample data set, counts how many
+  * other clicks from the same {@code ip} occur within the following ten minutes.
+  */
+ public class CountTen {
+ 
+     /** Length of the look-ahead window in seconds (ten minutes). */
+     private static final long TEN_MINUTES_IN_SECONDS = 600L;
+ 
+     /**
+      * Reads {@code ./data/train_.csv}, adds the {@code is_clicked_in_ten_mins} column and
+      * prints the rows of one sample IP.
+      *
+      * @param args command-line arguments; not used
+      * @throws Exception kept in the signature for backward compatibility
+      */
+     public static void main(String[] args) throws Exception {
+         SparkSession spark = SparkSession
+                 .builder()
+                 .master("local")
+                 .appName("Java Spark SQL basic example")
+                 .getOrCreate();
+ 
+         // Always release the local Spark context, even if the job fails part-way through.
+         try {
+             Dataset<Row> df = spark.read().format("csv")
+                     .option("inferSchema", "true")
+                     .option("header", "true")
+                     .load("./data/train_.csv");
+ 
+             // Replace the time columns with long-typed copies so they can be used as numeric
+             // range-frame boundaries.
+             // NOTE(review): assumes inferSchema parses both columns as timestamps, so the cast
+             // yields epoch seconds - confirm against the real data set.
+             Dataset<Row> newdf = df
+                     .withColumn("utc_click_time", col("click_time").cast("long"))
+                     .withColumn("utc_attributed_time", col("attributed_time").cast("long"))
+                     .drop("click_time")
+                     .drop("attributed_time");
+ 
+             // Range frame per ip: from the current row's click time up to ten minutes later.
+             // (A rowsBetween frame ending at Window.unboundedPreceding() was tried and rejected
+             // by Spark with "Boundary end is not a valid integer".)
+             WindowSpec w = Window.partitionBy("ip")
+                     .orderBy("utc_click_time")
+                     .rangeBetween(Window.currentRow(), TEN_MINUTES_IN_SECONDS);
+ 
+             // The frame contains the current row itself, hence the minus(1).
+             // TODO: decide whether the current click should be included in the count.
+             newdf = newdf.withColumn("is_clicked_in_ten_mins",
+                     count("utc_click_time").over(w).minus(1));
+ 
+             newdf.where("ip == '117898'").show(false);
+         } finally {
+             spark.stop();
+         }
+     }
+ }
\ No newline at end of file