Committed by
GitHub
Merge pull request #17 from Java-Cesco/feature/tenMinsHG
Feature/ten mins hg
Showing
6 changed files
with
58 additions
and
6 deletions
.idea/.name
0 → 100644
| 1 | +Detecting_fraud_clicks | ||
| ... | \ No newline at end of file | ... | \ No newline at end of file |
| ... | @@ -172,4 +172,4 @@ | ... | @@ -172,4 +172,4 @@ |
| 172 | <orderEntry type="library" name="Maven: com.databricks:spark-csv_2.11:1.5.0" level="project" /> | 172 | <orderEntry type="library" name="Maven: com.databricks:spark-csv_2.11:1.5.0" level="project" /> |
| 173 | <orderEntry type="library" name="Maven: org.apache.commons:commons-csv:1.1" level="project" /> | 173 | <orderEntry type="library" name="Maven: org.apache.commons:commons-csv:1.1" level="project" /> |
| 174 | </component> | 174 | </component> |
| 175 | -</module> | ||
| ... | \ No newline at end of file | ... | \ No newline at end of file |
| 175 | +</module> | ... | ... |
data/train.csv
0 → 100644
This diff could not be displayed because it is too large.
data/train_.csv
0 → 100644
| 1 | +ip,app,device,os,channel,click_time,attributed_time,is_attributed | ||
| 2 | +117898,12,1,13,497,2017-11-07 09:30:38,,0 | ||
| 3 | +117898,12,1,13,497,2017-11-07 09:30:38,,0 | ||
| 4 | +117898,12,1,13,497,2017-11-07 09:31:38,,0 | ||
| 5 | +117898,12,1,13,497,2017-11-07 09:31:38,,0 | ||
| 6 | +117898,12,1,13,497,2017-11-07 09:31:38,,0 | ||
| 7 | +117898,12,1,13,497,2017-11-07 09:39:38,,0 | ||
| 8 | +117898,12,1,13,497,2017-11-07 09:40:38,,0 | ||
| ... | \ No newline at end of file | ... | \ No newline at end of file |
| ... | @@ -16,19 +16,21 @@ | ... | @@ -16,19 +16,21 @@ |
| 16 | <artifactId>spark-core_2.11</artifactId> | 16 | <artifactId>spark-core_2.11</artifactId> |
| 17 | <version>2.3.0</version> | 17 | <version>2.3.0</version> |
| 18 | </dependency> | 18 | </dependency> |
| 19 | - | 19 | + <dependency> |
| 20 | + <groupId>org.apache.spark</groupId> | ||
| 21 | + <artifactId>spark-sql_2.11</artifactId> | ||
| 22 | + <version>2.2.0</version> | ||
| 23 | + </dependency> | ||
| 20 | <dependency> | 24 | <dependency> |
| 21 | <groupId>org.apache.spark</groupId> | 25 | <groupId>org.apache.spark</groupId> |
| 22 | <artifactId>spark-sql_2.11</artifactId> | 26 | <artifactId>spark-sql_2.11</artifactId> |
| 23 | <version>2.3.0</version> | 27 | <version>2.3.0</version> |
| 24 | </dependency> | 28 | </dependency> |
| 25 | - | ||
| 26 | <dependency> | 29 | <dependency> |
| 27 | <groupId>com.databricks</groupId> | 30 | <groupId>com.databricks</groupId> |
| 28 | <artifactId>spark-csv_2.11</artifactId> | 31 | <artifactId>spark-csv_2.11</artifactId> |
| 29 | <version>1.5.0</version> | 32 | <version>1.5.0</version> |
| 30 | </dependency> | 33 | </dependency> |
| 31 | - | ||
| 32 | </dependencies> | 34 | </dependencies> |
| 33 | 35 | ||
| 34 | <build> | 36 | <build> |
| ... | @@ -44,6 +46,5 @@ | ... | @@ -44,6 +46,5 @@ |
| 44 | </plugin> | 46 | </plugin> |
| 45 | </plugins> | 47 | </plugins> |
| 46 | </build> | 48 | </build> |
| 47 | - | 49 | + |
| 48 | - | ||
| 49 | </project> | 50 | </project> |
| ... | \ No newline at end of file | ... | \ No newline at end of file | ... | ... |
src/main/java/CountTen.java
0 → 100644
| 1 | +import org.apache.spark.sql.Column; | ||
| 2 | +import org.apache.spark.sql.Dataset; | ||
| 3 | +import org.apache.spark.sql.Row; | ||
| 4 | +import org.apache.spark.sql.SparkSession; | ||
| 5 | +import org.apache.spark.sql.expressions.Window; | ||
| 6 | +import org.apache.spark.sql.expressions.WindowSpec; | ||
| 7 | + | ||
| 8 | +import static org.apache.spark.sql.functions.*; | ||
| 9 | + | ||
| 10 | + | ||
| 11 | +public class CountTen { | ||
| 12 | + | ||
| 13 | + public static void main(String[] args) throws Exception { | ||
| 14 | + SparkSession spark = SparkSession | ||
| 15 | + .builder() | ||
| 16 | + .master("local") | ||
| 17 | + .appName("Java Spark SQL basic example") | ||
| 18 | + .getOrCreate(); | ||
| 19 | + | ||
| 20 | + Dataset<Row> df = spark.read().format("csv") | ||
| 21 | + .option("inferSchema", "true") | ||
| 22 | + .option("header", "true") | ||
| 23 | + .load("./data/train_.csv"); | ||
| 24 | + | ||
| 25 | + // cast timestamp to long | ||
| 26 | + Dataset<Row> newdf = df.withColumn("utc_click_time", df.col("click_time").cast("long")); | ||
| 27 | + newdf = newdf.withColumn("utc_attributed_time", df.col("attributed_time").cast("long")); | ||
| 28 | + newdf = newdf.drop("click_time").drop("attributed_time"); | ||
| 29 | + | ||
| 30 | + WindowSpec w = Window.partitionBy("ip") | ||
| 31 | + .orderBy("utc_click_time") | ||
| 32 | + .rangeBetween(Window.currentRow(),Window.currentRow()+600); | ||
| 33 | +// .rowsBetween(Window.currentRow(), Window.unboundedPreceding()); //Boundary end is not a valid integer: -9223372036854775808 | ||
| 34 | + | ||
| 35 | + newdf = newdf.withColumn("is_clicked_in_ten_mins", | ||
| 36 | + (count("utc_click_time").over(w)).minus(1)); //본인것 포함할 것인지 정해야함. | ||
| 37 | +// newdf = newdf.withColumn("is_clicked_in_ten_mins", | ||
| 38 | +// (lead(col("utc_click_time"),1).over(w).minus(col("utc_click_time")).lt((long)600)).cast("long")); | ||
| 39 | + | ||
| 40 | + newdf.where("ip == '117898'").show(false); | ||
| 41 | + } | ||
| 42 | +} | ||
| ... | \ No newline at end of file | ... | \ No newline at end of file |
-
Please register or login to post a comment