신은섭(Shin Eun Seop)

Merge branch 'feature/#3' into feature/tenMinsHG

File mode changed
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="MarkdownExportedFiles">
+    <htmlFiles />
+    <imageFiles />
+    <otherFiles />
+  </component>
+</project>
\ No newline at end of file
@@ -8,7 +8,17 @@
       </list>
     </option>
   </component>
-  <component name="ProjectRootManager" version="2" languageLevel="JDK_1_8" default="true" project-jdk-name="1.8" project-jdk-type="JavaSDK">
+  <component name="ProjectRootManager" version="2" languageLevel="JDK_1_8" project-jdk-name="1.8" project-jdk-type="JavaSDK">
     <output url="file://$PROJECT_DIR$/out" />
   </component>
+  <component name="MavenProjectsManager">
+    <option name="originalFiles">
+      <list>
+        <option value="$PROJECT_DIR$/pom.xml" />
+      </list>
+    </option>
+  </component>
+  <component name="ProjectRootManager" version="2" languageLevel="JDK_1_8" default="false" project-jdk-name="1.8" project-jdk-type="JavaSDK">
+    <output url="file:///tmp" />
+  </component>
 </project>
\ No newline at end of file
File mode changed
File mode changed
@@ -31,7 +31,20 @@
         <artifactId>spark-csv_2.11</artifactId>
         <version>1.5.0</version>
     </dependency>
-
 </dependencies>
 
+    <build>
+        <plugins>
+            <plugin>
+                <groupId>org.apache.maven.plugins</groupId>
+                <artifactId>maven-compiler-plugin</artifactId>
+                <version>3.6.1</version>
+                <configuration>
+                    <source>1.8</source>
+                    <target>1.8</target>
+                </configuration>
+            </plugin>
+        </plugins>
+    </build>
+
 </project>
\ No newline at end of file
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.SparkSession;
+import org.apache.spark.sql.expressions.Window;
+import org.apache.spark.sql.expressions.WindowSpec;
+
+import static org.apache.spark.sql.functions.*;
+
+public class Aggregation {
+
+    public static void main(String[] args) throws Exception {
+
+        // Create Session
+        SparkSession spark = SparkSession
+                .builder()
+                .appName("Detecting Fraud Clicks")
+                .master("local")
+                .getOrCreate();
+
+        Aggregation agg = new Aggregation();
+
+        Dataset<Row> dataset = agg.loadCSVDataSet("./train_sample.csv", spark);
+        dataset = agg.changeTimestempToLong(dataset);
+        dataset = agg.averageValidClickCount(dataset);
+        dataset = agg.clickTimeDelta(dataset);
+
+        dataset.where("ip == '5348' and app == '19'").show();
+    }
+
+    private Dataset<Row> loadCSVDataSet(String path, SparkSession spark) {
+        // Read CSV into a Dataset
+        Dataset<Row> dataset = spark.read().format("csv")
+                .option("inferSchema", "true")
+                .option("header", "true")
+                .load(path);
+        return dataset;
+    }
+
+    private Dataset<Row> changeTimestempToLong(Dataset<Row> dataset) {
+        // cast the timestamp columns to long (epoch seconds)
+        Dataset<Row> newDF = dataset.withColumn("utc_click_time", dataset.col("click_time").cast("long"));
+        newDF = newDF.withColumn("utc_attributed_time", dataset.col("attributed_time").cast("long"));
+        newDF = newDF.drop("click_time").drop("attributed_time");
+        return newDF;
+    }
+
+    private Dataset<Row> averageValidClickCount(Dataset<Row> dataset) {
+        // window partitioned by 'ip' and 'app', ordered by 'utc_click_time',
+        // covering the rows from the first row up to the current row
+        WindowSpec w = Window.partitionBy("ip", "app")
+                .orderBy("utc_click_time")
+                .rowsBetween(Window.unboundedPreceding(), Window.currentRow());
+
+        // aggregation
+        Dataset<Row> newDF = dataset.withColumn("cum_count_click", count("utc_click_time").over(w));
+        newDF = newDF.withColumn("cum_sum_attributed", sum("is_attributed").over(w));
+        newDF = newDF.withColumn("avg_valid_click_count", col("cum_sum_attributed").divide(col("cum_count_click")));
+        newDF = newDF.drop("cum_count_click", "cum_sum_attributed");
+        return newDF;
+    }
+
+    private Dataset<Row> clickTimeDelta(Dataset<Row> dataset) {
+        WindowSpec w = Window.partitionBy("ip")
+                .orderBy("utc_click_time");
+
+        Dataset<Row> newDF = dataset.withColumn("lag(utc_click_time)", lag("utc_click_time", 1).over(w));
+        newDF = newDF.withColumn("click_time_delta",
+                when(col("lag(utc_click_time)").isNull(), lit(0))
+                        .otherwise(col("utc_click_time"))
+                        .minus(when(col("lag(utc_click_time)").isNull(), lit(0))
+                                .otherwise(col("lag(utc_click_time)"))));
+        newDF = newDF.drop("lag(utc_click_time)");
+        return newDF;
+    }
+}
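The nested when()/otherwise() expression in clickTimeDelta() is hard to follow. As a minimal sketch (an alternative formulation, not part of this commit), the same per-ip forward delta can be written with coalesce(), assuming the "ip" and "utc_click_time" columns produced by changeTimestempToLong():

    // Hypothetical alternative to clickTimeDelta(); uses coalesce(), lag() and col()
    // from org.apache.spark.sql.functions (already star-imported above).
    // When there is no previous click for the ip, the delta falls back to 0.
    WindowSpec byIp = Window.partitionBy("ip").orderBy("utc_click_time");
    Dataset<Row> withDelta = dataset.withColumn("click_time_delta",
            col("utc_click_time").minus(
                    coalesce(lag("utc_click_time", 1).over(byIp), col("utc_click_time"))));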
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.SparkSession;
+import org.apache.spark.sql.expressions.Window;
+import org.apache.spark.sql.expressions.WindowSpec;
+
+import static org.apache.spark.sql.functions.col;
+import static org.apache.spark.sql.functions.count;
+import static org.apache.spark.sql.functions.sum;
+
+public class AvgAdvTime {
+
+    public static void main(String[] args) throws Exception {
+
+        // Start Spark Session
+        SparkSession spark = SparkSession
+                .builder()
+                .master("local")
+                .appName("Java Spark SQL basic example")
+                .getOrCreate();
+
+        // Read CSV into a Dataset
+        Dataset<Row> df = spark.read().format("csv")
+                .option("inferSchema", "true")
+                .option("header", "true")
+                .load("train_sample.csv");
+
+        // cast the timestamp columns to long (epoch seconds)
+        Dataset<Row> newdf = df.withColumn("utc_click_time", df.col("click_time").cast("long"));
+        newdf = newdf.withColumn("utc_attributed_time", df.col("attributed_time").cast("long"));
+        newdf = newdf.drop("click_time").drop("attributed_time");
+
+        // window partitioned by 'ip' and 'app', ordered by 'utc_click_time',
+        // covering the rows from the first row up to the current row
+        WindowSpec w = Window.partitionBy("ip", "app")
+                .orderBy("utc_click_time")
+                .rowsBetween(Window.unboundedPreceding(), Window.currentRow());
+
+        // aggregation
+        newdf = newdf.withColumn("cum_count_click", count("utc_click_time").over(w));
+        newdf = newdf.withColumn("cum_sum_attributed", sum("is_attributed").over(w));
+        newdf = newdf.withColumn("avg_efficient", col("cum_sum_attributed").divide(col("cum_count_click")));
+
+        // print example
+        newdf.where("ip == '5348' and app == '19'").show();
+        newdf.printSchema();
+    }
+}
\ No newline at end of file
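A quick way to sanity-check the cumulative avg_efficient logic is to run the same window over a tiny hand-built Dataset instead of train_sample.csv. The snippet below is a hypothetical sketch, not part of this commit; it assumes a running SparkSession named spark like the one created in AvgAdvTime, plus java.util.Arrays, org.apache.spark.sql.RowFactory and org.apache.spark.sql.types.* on top of the imports already shown:

    // Three clicks from one (ip, app) pair; only the second one converts.
    StructType schema = new StructType(new StructField[]{
            new StructField("ip", DataTypes.IntegerType, false, Metadata.empty()),
            new StructField("app", DataTypes.IntegerType, false, Metadata.empty()),
            new StructField("utc_click_time", DataTypes.LongType, false, Metadata.empty()),
            new StructField("is_attributed", DataTypes.IntegerType, false, Metadata.empty())
    });
    Dataset<Row> tiny = spark.createDataFrame(Arrays.asList(
            RowFactory.create(1, 10, 100L, 0),
            RowFactory.create(1, 10, 160L, 1),
            RowFactory.create(1, 10, 200L, 0)), schema);

    WindowSpec w = Window.partitionBy("ip", "app")
            .orderBy("utc_click_time")
            .rowsBetween(Window.unboundedPreceding(), Window.currentRow());

    // Cumulative attribution rate; expected values are 0.0, 0.5 and 0.333...
    tiny.withColumn("avg_efficient",
            sum("is_attributed").over(w).divide(count("utc_click_time").over(w)))
            .show();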
-import java.text.ParseException;
-import java.text.SimpleDateFormat;
-import java.util.Calendar;
-
-/**
- * Utility class that collects Calendar-related helper methods.
- *
- * @author croute
- * @since 2011.02.10
- */
-public class DateUtil
-{
-
-    /**
-     * Converts a Calendar object into a string of the form yyyy-MM-dd HH:mm:ss.
-     *
-     * @param cal the Calendar object
-     * @return the converted string
-     */
-    public static String StringFromCalendar(Calendar cal)
-    {
-        // convert the date into a string for transmission
-        SimpleDateFormat formatter = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
-        return formatter.format(cal.getTime());
-    }
-
-    /**
-     * Converts a Calendar object into a string of the form yyyy-MM-dd.
-     *
-     * @param cal the Calendar object
-     * @return the converted string
-     */
-    public static String StringSimpleFromCalendar(Calendar cal)
-    {
-        // convert the date into a string for transmission
-        SimpleDateFormat formatter = new SimpleDateFormat("yyyy-MM-dd");
-        return formatter.format(cal.getTime());
-    }
-
-    /**
-     * Converts a string of the form yyyy-MM-dd HH:mm:ss into a Calendar object.
-     * If the conversion fails, today's date is returned.
-     *
-     * @param date the string representing the date
-     * @return the converted Calendar object
-     */
-    public static Calendar CalendarFromString(String date)
-    {
-        if (date.length() == 0)
-            return null;
-        Calendar cal = Calendar.getInstance();
-        try
-        {
-            //String oldstring = "2011-01-18 00:00:00.0";
-            //Date date = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss.S").parse(oldstring);
-            SimpleDateFormat formatter = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
-            cal.setTime(formatter.parse(date));
-        }
-        catch(ParseException e)
-        {
-            e.printStackTrace();
-        }
-        return cal;
-    }
-
-    /**
-     * Converts a string of the form yyyy-MM-dd into a Calendar object.
-     * If the conversion fails, today's date is returned.
-     *
-     * @param date the string representing the date
-     * @return the converted Calendar object
-     */
-    public static Calendar CalendarFromStringSimple(String date)
-    {
-        Calendar cal = Calendar.getInstance();
-
-        try
-        {
-            SimpleDateFormat formatter = new SimpleDateFormat("yyyy-MM-dd");
-            cal.setTime(formatter.parse(date));
-        }
-        catch(ParseException e)
-        {
-            e.printStackTrace();
-        }
-        return cal;
-    }
-}
\ No newline at end of file
-import org.apache.spark.SparkConf;
-import org.apache.spark.api.java.JavaPairRDD;
-import org.apache.spark.api.java.JavaRDD;
-import org.apache.spark.api.java.JavaSparkContext;
-import org.apache.spark.api.java.function.Function;
-import org.apache.spark.sql.Dataset;
-import org.apache.spark.sql.Row;
-import org.apache.spark.sql.SQLContext;
-import org.apache.spark.sql.SparkSession;
-import org.apache.spark.sql.types.StructType;
-import scala.Serializable;
-import scala.Tuple2;
-
-import java.util.*;
-
-//ip,app,device,os,channel,click_time,attributed_time,is_attributed
-//87540,12,1,13,497,2017-11-07 09:30:38,,0
-class Record implements Serializable {
-    Integer ip;
-    Integer app;
-    Integer device;
-    Integer os;
-    Integer channel;
-    Calendar clickTime;
-    Calendar attributedTime;
-    Boolean isAttributed;
-    Integer clickInTenMins;
-
-    // constructor, getters and setters
-    public Record(int pIp, int pApp, int pDevice, int pOs, int pChannel, Calendar pClickTime, Calendar pAttributedTime, boolean pIsAttributed) {
-        ip = new Integer(pIp);
-        app = new Integer(pApp);
-        device = new Integer(pDevice);
-        os = new Integer(pOs);
-        channel = new Integer(pChannel);
-        clickTime = pClickTime;
-        attributedTime = pAttributedTime;
-        isAttributed = new Boolean(pIsAttributed);
-        clickInTenMins = new Integer(0);
-    }
-
-    public Record(int pIp, int pApp, int pDevice, int pOs, int pChannel, Calendar pClickTime, Calendar pAttributedTime, boolean pIsAttributed, int pClickInTenMins) {
-        ip = new Integer(pIp);
-        app = new Integer(pApp);
-        device = new Integer(pDevice);
-        os = new Integer(pOs);
-        channel = new Integer(pChannel);
-        clickTime = pClickTime;
-        attributedTime = pAttributedTime;
-        isAttributed = new Boolean(pIsAttributed);
-        clickInTenMins = new Integer(pClickInTenMins);
-    }
-}
-
-class RecordComparator implements Comparator<Record> {
-    @Override
-    public int compare(Record v1, Record v2) {
-//        if(a.ano < b.ano) return -1;
-//        else if(a.ano == b.ano) return 0;
-//        else return 1;
-        if (v1.ip.compareTo(v2.ip) == 0) {
-            return v1.clickTime.compareTo(v2.clickTime);
-        }
-        return v1.ip.compareTo(v2.ip);
-    }
-}
-
-public class MapExample {
-
-    static SparkConf conf = new SparkConf().setMaster("local[*]").setAppName("Cesco");
-    static JavaSparkContext sc = new JavaSparkContext(conf);
-    static SQLContext sqlContext = new SQLContext(sc);
-
-    public static void main(String[] args) throws Exception {
-        JavaRDD<String> file = sc.textFile("/Users/hyeongyunmun/Dropbox/DetectFraudClick/data/train.csv", 1);
-
-        final String header = file.first();
-        JavaRDD<String> data = file.filter(line -> !line.equalsIgnoreCase(header));
-
-        JavaRDD<Record> records = data.map(line -> {
-            String[] fields = line.split(",");
-            Record sd = new Record(Integer.parseInt(fields[0]), Integer.parseInt(fields[1]), Integer.parseInt(fields[2]), Integer.parseInt(fields[3]), Integer.parseInt(fields[4]), DateUtil.CalendarFromString(fields[5]), DateUtil.CalendarFromString(fields[6]), "1".equalsIgnoreCase(fields[7].trim()));
-            return sd;
-        });
-
-//        JavaRDD<Tuple4<Integer,Double,Long,Integer>> secondSortRDD = firstSortRDD.keyBy(new Function<Tuple4<Integer, Double, Long, Integer>, Tuple2<Double, Long>>(){
-//            @Override
-//            public Tuple2<Double, Long> call(Tuple4<Integer, Double, Long, Integer> value) throws Exception {
-//                return new Tuple2(value._2(),value._3());
-//            }}).sortByKey(new TupleComparator()).values();
-
-        JavaRDD<Record> firstSorted = records.sortBy(new Function<Record, Calendar>() {
-            @Override
-            public Calendar call(Record record) throws Exception {
-                return record.clickTime;
-            }
-        }, true, 1);
-
-        JavaRDD<Record> sortedRecords = firstSorted.sortBy(new Function<Record, Integer>() {
-            @Override
-            public Integer call(Record record) throws Exception {
-                return record.ip.intValue();
-            }
-        }, true, 1);
-
-        /*
-        // tried to sort by both keys at once, but it did not work
-        JavaRDD<Record> sortedRecords = records.keyBy(new Function<Record, Record>(){
-            @Override
-            public Record call(Record record) throws Exception {
-                return new Record(record.ip, record.app, record.device, record.os, record.channel, record.clickTime, record.attributedTime, record.isAttributed);
-            }}).sortByKey(new RecordComparator()).values();
-        */
-
-//        System.out.println("sortedRecords");
-//        sortedRecords.foreach(record -> {System.out.println(record.ip + " " + record.clickTime.getTime());});
-
-//        System.out.println("make result");
-        /*
-        // tried to look at the next element inside map, but it did not work
-        JavaRDD<Record> result = sortedRecords.map(record -> {
-            System.out.println("make addTen");
-            Calendar addTen = Calendar.getInstance();
-            addTen.setTime(record.clickTime.getTime());
-            addTen.add(Calendar.MINUTE, 10);
-
-            System.out.println("make count");
-            int count = 0;
-            for (Record temp : sortedRecords.collect()) {
-                if (temp.ip.compareTo(record.ip) == 0 && temp.clickTime.compareTo(record.clickTime) > 0 && temp.clickTime.compareTo(addTen) < 0)
-                    count++;
-            }
-
-            return new Record(record.ip, record.app, record.device, record.os, record.channel, record.clickTime, record.attributedTime, record.isAttributed, count);
-        });
-        */
-//        System.out.println("result");
-//        result.foreach(record -> {System.out.println(record.ip + " " + record.clickTime.getTime());});
-
-        /*
-        for (final ListIterator<String> it = list.listIterator(); it.hasNext();) {
-            final String s = it.next();
-            System.out.println(it.previousIndex() + ": " + s);
-        }
-
-        for (ListIterator<Record> it = sortedRecords.collect().listIterator(); it.hasNext(); it = it.nextIndex()) {
-            it.
-            if (temp.ip.compareTo(record.ip) == 0 && temp.clickTime.compareTo(record.clickTime) > 0 && temp.clickTime.compareTo(addTen) < 0)
-                count++;
-        }
-        */
-
-        List<Record> list = sortedRecords.collect();
-
-        List<Record> resultList = new ArrayList<Record>();
-        for (int i = 0; i < list.size(); i++) {
-            //System.out.println(list.get(i).ip);
-
-            Record record = list.get(i);
-
-            Calendar addTen = Calendar.getInstance();
-            addTen.setTime(record.clickTime.getTime());
-            addTen.add(Calendar.MINUTE, 10);
-
-            int count = 0;
-
-            for (int j = i + 1; j < list.size() && list.get(j).ip.compareTo(record.ip) == 0
-                    && list.get(j).clickTime.compareTo(record.clickTime) > 0 && list.get(j).clickTime.compareTo(addTen) < 0; j++)
-                count++;
-
-            resultList.add(new Record(record.ip, record.app, record.device, record.os, record.channel, record.clickTime, record.attributedTime, record.isAttributed, count));
-        }
-
-        JavaRDD<Record> result = sc.parallelize(resultList);
-        result.foreach(record -> {System.out.println(record.ip + " " + record.clickTime.getTime() + " " + record.clickInTenMins);});
-    }
-}
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.SparkSession;
+import org.apache.spark.sql.expressions.Window;
+import org.apache.spark.sql.expressions.WindowSpec;
+
+import static org.apache.spark.sql.functions.*;
+
+public class calForwardTimeDelta {
+    static SparkConf conf = new SparkConf().setMaster("local[*]").setAppName("Cesco");
+    static JavaSparkContext sc = new JavaSparkContext(conf);
+
+    public static void main(String[] args) throws Exception {
+        // Create Session
+        SparkSession spark = SparkSession
+                .builder()
+                .appName("Detecting Fraud Clicks")
+                .getOrCreate();
+
+        // run methods here
+        calcDelta(spark);
+    }
+
+    private static void calcDelta(SparkSession spark) {
+        // path of the input file to process
+        String filepath = "train_sample.csv";
+
+        // create Dataset from the file
+        Dataset<Row> logDF = spark.read()
+                .format("csv")
+                .option("inferSchema", "true")
+                .option("header", "true")
+                .load(filepath);
+
+        // cast the timestamp columns (click_time, attributed_time) to long
+
+        // add column for long(click_time)
+        Dataset<Row> newDF = logDF.withColumn("utc_click_time", logDF.col("click_time").cast("long"));
+        // add column for long(attributed_time)
+        newDF = newDF.withColumn("utc_attributed_time", logDF.col("attributed_time").cast("long"));
+        // drop the timestamp-typed columns
+        newDF = newDF.drop("click_time").drop("attributed_time");
+        newDF.createOrReplaceTempView("logs");
+
+        WindowSpec w = Window.partitionBy("ip")
+                .orderBy("utc_click_time");
+
+        newDF = newDF.withColumn("lag(utc_click_time)", lag("utc_click_time", 1).over(w));
+        newDF.where("ip=10").show();
+        newDF = newDF.withColumn("delta",
+                when(col("lag(utc_click_time)").isNull(), lit(0))
+                        .otherwise(col("utc_click_time"))
+                        .minus(when(col("lag(utc_click_time)").isNull(), lit(0))
+                                .otherwise(col("lag(utc_click_time)"))));
+        //newDF = newDF.withColumn("delta", datediff());
+        newDF = newDF.drop("lag(utc_click_time)");
+        newDF = newDF.orderBy("ip");
+
+        newDF.show();
+    }
+}
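Because the long-cast DataFrame is registered as the temp view "logs", the same forward delta can also be expressed in Spark SQL. A minimal sketch (an assumption for illustration, not part of this commit), reusing the SparkSession named spark from calcDelta():

    // Hypothetical SQL formulation of calcDelta(); LAG(...) OVER (PARTITION BY ... ORDER BY ...)
    // is Spark SQL's window syntax, and COALESCE makes the first click per ip yield a delta of 0.
    Dataset<Row> deltas = spark.sql(
            "SELECT ip, utc_click_time, " +
            "       utc_click_time - COALESCE(" +
            "           LAG(utc_click_time, 1) OVER (PARTITION BY ip ORDER BY utc_click_time), " +
            "           utc_click_time) AS delta " +
            "FROM logs ORDER BY ip");
    deltas.show();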
-public class valid {
-    private int x;
-
-    valid() {
-        x = 0;
-    }
-
-    void printX() {
-        System.out.println(x);
-    }
-
-    public static void main(String[] args) {
-        valid v = new valid();
-        v.printX();
-    }
-
-}
File mode changed
This diff could not be displayed because it is too large.