Showing
3 changed files
with
275 additions
and
62 deletions
src/main/java/Aggregation.java
0 → 100644
| 1 | +import org.apache.spark.api.java.function.MapFunction; | ||
| 2 | +import org.apache.spark.sql.Dataset; | ||
| 3 | +import org.apache.spark.sql.Encoders; | ||
| 4 | +import org.apache.spark.sql.Row; | ||
| 5 | +import org.apache.spark.sql.SparkSession; | ||
| 6 | +import org.apache.spark.sql.expressions.Window; | ||
| 7 | +import org.apache.spark.sql.expressions.WindowSpec; | ||
| 8 | + | ||
| 9 | +import java.util.ArrayList; | ||
| 10 | +import java.util.List; | ||
| 11 | + | ||
| 12 | +import static org.apache.spark.sql.functions.*; | ||
| 13 | +import static org.apache.spark.sql.functions.lit; | ||
| 14 | +import static org.apache.spark.sql.functions.when; | ||
| 15 | + | ||
| 16 | +public class Aggregation { | ||
| 17 | + | ||
| 18 | + public static void main(String[] args) throws Exception { | ||
| 19 | + | ||
| 20 | + //Create Session | ||
| 21 | + SparkSession spark = SparkSession | ||
| 22 | + .builder() | ||
| 23 | + .appName("Detecting Fraud Clicks") | ||
| 24 | + .master("local") | ||
| 25 | + .getOrCreate(); | ||
| 26 | + | ||
| 27 | + // Aggregation | ||
| 28 | + Aggregation agg = new Aggregation(); | ||
| 29 | + | ||
| 30 | + Dataset<Row> dataset = agg.loadCSVDataSet("/home/chris/.kaggle/competitions/talkingdata-adtracking-fraud-detection/mnt/ssd/kaggle-talkingdata2/competition_files/train_sample.csv", spark); | ||
| 31 | + dataset = agg.changeTimestempToLong(dataset); | ||
| 32 | + dataset = agg.averageValidClickCount(dataset); | ||
| 33 | + dataset = agg.clickTimeDelta(dataset); | ||
| 34 | + dataset = agg.countClickInTenMinutes(dataset); | ||
| 35 | + | ||
| 36 | + long start = System.currentTimeMillis(); | ||
| 37 | + | ||
| 38 | + List<String> logs_with_features = dataset.map(row->row.toString(), Encoders.STRING()).collectAsList(); | ||
| 39 | + String[][] contents = new String[(int)dataset.count()][11]; | ||
| 40 | + for (int i =0; i<logs_with_features.size();i++){ | ||
| 41 | + String str_to_split = logs_with_features.get(i); | ||
| 42 | + String[] tmp = str_to_split.substring(1,str_to_split.length()-1).split(","); | ||
| 43 | + contents[i] = tmp; | ||
| 44 | + } | ||
| 45 | + | ||
| 46 | + long end = System.currentTimeMillis(); | ||
| 47 | + System.out.println("JK's Procedure time elapsed : " + (end-start)/1000.0); | ||
| 48 | + | ||
| 49 | + start = System.currentTimeMillis(); | ||
| 50 | + List<String> stringDataset = dataset.toJSON().collectAsList(); | ||
| 51 | + end = System.currentTimeMillis(); | ||
| 52 | + System.out.println("Steve's Procedure 1 time elapsed : " + (end-start)/1000.0); | ||
| 53 | + new GUI(stringDataset, contents); | ||
| 54 | + | ||
| 55 | + | ||
| 56 | + } | ||
| 57 | + | ||
| 58 | + | ||
| 59 | + private Dataset<Row> loadCSVDataSet(String path, SparkSession spark){ | ||
| 60 | + // Read SCV to DataSet | ||
| 61 | + return spark.read().format("csv") | ||
| 62 | + .option("inferSchema", "true") | ||
| 63 | + .option("header", "true") | ||
| 64 | + .load(path); | ||
| 65 | + } | ||
| 66 | + | ||
| 67 | + private Dataset<Row> changeTimestempToLong(Dataset<Row> dataset){ | ||
| 68 | + // cast timestamp to long | ||
| 69 | + Dataset<Row> newDF = dataset.withColumn("utc_click_time", dataset.col("click_time").cast("long")); | ||
| 70 | + newDF = newDF.withColumn("utc_attributed_time", dataset.col("attributed_time").cast("long")); | ||
| 71 | + newDF = newDF.drop("click_time").drop("attributed_time"); | ||
| 72 | + return newDF; | ||
| 73 | + } | ||
| 74 | + | ||
| 75 | + private Dataset<Row> averageValidClickCount(Dataset<Row> dataset){ | ||
| 76 | + // set Window partition by 'ip' and 'app' order by 'utc_click_time' select rows between 1st row to current row | ||
| 77 | + WindowSpec w = Window.partitionBy("ip", "app") | ||
| 78 | + .orderBy("utc_click_time") | ||
| 79 | + .rowsBetween(Window.unboundedPreceding(), Window.currentRow()); | ||
| 80 | + | ||
| 81 | + // aggregation | ||
| 82 | + Dataset<Row> newDF = dataset.withColumn("cum_count_click", count("utc_click_time").over(w)); | ||
| 83 | + newDF = newDF.withColumn("cum_sum_attributed", sum("is_attributed").over(w)); | ||
| 84 | + newDF = newDF.withColumn("avg_valid_click_count", col("cum_sum_attributed").divide(col("cum_count_click"))); | ||
| 85 | + newDF = newDF.drop("cum_count_click", "cum_sum_attributed"); | ||
| 86 | + return newDF; | ||
| 87 | + } | ||
| 88 | + | ||
| 89 | + private Dataset<Row> clickTimeDelta(Dataset<Row> dataset){ | ||
| 90 | + WindowSpec w = Window.partitionBy ("ip") | ||
| 91 | + .orderBy("utc_click_time"); | ||
| 92 | + | ||
| 93 | + Dataset<Row> newDF = dataset.withColumn("lag(utc_click_time)", lag("utc_click_time",1).over(w)); | ||
| 94 | + newDF = newDF.withColumn("click_time_delta", when(col("lag(utc_click_time)").isNull(), | ||
| 95 | + lit(0)).otherwise(col("utc_click_time")).minus(when(col("lag(utc_click_time)").isNull(), | ||
| 96 | + lit(0)).otherwise(col("lag(utc_click_time)")))); | ||
| 97 | + newDF = newDF.drop("lag(utc_click_time)"); | ||
| 98 | + return newDF; | ||
| 99 | + } | ||
| 100 | + | ||
| 101 | + private Dataset<Row> countClickInTenMinutes(Dataset<Row> dataset){ | ||
| 102 | + WindowSpec w = Window.partitionBy("ip") | ||
| 103 | + .orderBy("utc_click_time") | ||
| 104 | + .rangeBetween(Window.currentRow(),Window.currentRow()+600); | ||
| 105 | + | ||
| 106 | + Dataset<Row> newDF = dataset.withColumn("count_click_in_ten_mins", | ||
| 107 | + (count("utc_click_time").over(w)).minus(1)); //TODO 본인것 포함할 것인지 정해야함. | ||
| 108 | + return newDF; | ||
| 109 | + } | ||
| 110 | +} | ||
| ... | \ No newline at end of file | ... | \ No newline at end of file |
src/main/java/GUI.java
0 → 100644
| 1 | +import org.apache.spark.sql.Dataset; | ||
| 2 | +import org.apache.spark.sql.Row; | ||
| 3 | + | ||
| 4 | +import javax.swing.*; | ||
| 5 | +import java.awt.*; | ||
| 6 | +import java.io.BufferedReader; | ||
| 7 | +import java.io.StringReader; | ||
| 8 | +import java.sql.ResultSet; | ||
| 9 | +import java.sql.ResultSetMetaData; | ||
| 10 | +import java.sql.Statement; | ||
| 11 | +import java.util.List; | ||
| 12 | +import java.util.Vector; | ||
| 13 | +import java.awt.BorderLayout; | ||
| 14 | +import java.awt.GridLayout; | ||
| 15 | +import java.awt.event.ActionEvent; | ||
| 16 | +import java.awt.event.ActionListener; | ||
| 17 | +import java.sql.Connection; | ||
| 18 | +import java.sql.DriverManager; | ||
| 19 | +import java.sql.ResultSet; | ||
| 20 | +import java.sql.ResultSetMetaData; | ||
| 21 | +import java.sql.Statement; | ||
| 22 | +import java.util.Vector; | ||
| 23 | + | ||
| 24 | +import javax.swing.JButton; | ||
| 25 | +import javax.swing.JFrame; | ||
| 26 | +import javax.swing.JLabel; | ||
| 27 | +import javax.swing.JPanel; | ||
| 28 | +import javax.swing.JScrollPane; | ||
| 29 | +import javax.swing.JTable; | ||
| 30 | +import javax.swing.JTextField; | ||
| 31 | +import javax.swing.table.AbstractTableModel; | ||
| 32 | +import javax.swing.table.DefaultTableModel; | ||
| 33 | + | ||
| 34 | +public class GUI extends JFrame { | ||
| 35 | + JTabbedPane tab = new JTabbedPane(); | ||
| 36 | + | ||
| 37 | + public GUI(List<String> q, String[][] data) { | ||
| 38 | + super("CESCO"); | ||
| 39 | + | ||
| 40 | + tab.addTab("png", new PngPane()); | ||
| 41 | + tab.addTab("gif", new GifPane()); | ||
| 42 | + tab.addTab("jpg", new JpgPane()); | ||
| 43 | + tab.addTab("table", new createTable(q)); | ||
| 44 | + tab.addTab("processed_features", new createTable_alter(data)); | ||
| 45 | + | ||
| 46 | + add(tab); | ||
| 47 | + | ||
| 48 | + setSize(800, 500); // 윈도우의 크기 가로x세로 | ||
| 49 | + setVisible(true); // 창을 보여줄떄 true, 숨길때 false | ||
| 50 | + setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE); // x 버튼을 눌렀을때 종료 | ||
| 51 | + } | ||
| 52 | + | ||
| 53 | +// public static void main(String args[]) { | ||
| 54 | +// new GUI(); | ||
| 55 | +// } | ||
| 56 | +} | ||
| 57 | + | ||
| 58 | +class PngPane extends JPanel { | ||
| 59 | + public PngPane() { | ||
| 60 | + super(); | ||
| 61 | + ImageIcon image = new ImageIcon("data/model.png"); | ||
| 62 | + JLabel label = new JLabel("", image, JLabel.CENTER); | ||
| 63 | + setLayout(new BorderLayout()); | ||
| 64 | + add(label, BorderLayout.CENTER); | ||
| 65 | + } | ||
| 66 | +} | ||
| 67 | + | ||
| 68 | +class GifPane extends JPanel { | ||
| 69 | + public GifPane() { | ||
| 70 | + super(); | ||
| 71 | + ImageIcon image = new ImageIcon("data/model.gif"); | ||
| 72 | + JLabel label = new JLabel("", image, JLabel.CENTER); | ||
| 73 | + setLayout(new BorderLayout()); | ||
| 74 | + add(label, BorderLayout.CENTER); | ||
| 75 | + } | ||
| 76 | +} | ||
| 77 | + | ||
| 78 | +class JpgPane extends JPanel { | ||
| 79 | + public JpgPane() { | ||
| 80 | + super(); | ||
| 81 | + ImageIcon image = new ImageIcon("data/model.jpg"); | ||
| 82 | + JLabel label = new JLabel("", image, JLabel.CENTER); | ||
| 83 | + setLayout(new BorderLayout()); | ||
| 84 | + add(label, BorderLayout.CENTER); | ||
| 85 | + } | ||
| 86 | +} | ||
| 87 | + | ||
| 88 | +class createTable_alter extends JPanel{ | ||
| 89 | + private String[] header = {"ip","app","device","os","channel","is_attributed","click_time","attributed_time", | ||
| 90 | + "avg_valid_click_count","click_time_delta","count_click_in_tenmin"}; | ||
| 91 | +/* | ||
| 92 | +root | ||
| 93 | + |-- ip: integer (nullable = true) | ||
| 94 | + |-- app: integer (nullable = true) | ||
| 95 | + |-- device: integer (nullable = true) | ||
| 96 | + |-- os: integer (nullable = true) | ||
| 97 | + |-- channel: integer (nullable = true) | ||
| 98 | + |-- is_attributed: integer (nullable = true) | ||
| 99 | + |-- utc_click_time: long (nullable = true) | ||
| 100 | + |-- utc_attributed_time: long (nullable = true) | ||
| 101 | + |-- avg_valid_click_count: double (nullable = true) | ||
| 102 | + |-- click_time_delta: long (nullable = true) | ||
| 103 | + |-- count_click_in_ten_mins: long (nullable = false) | ||
| 104 | + */ | ||
| 105 | + public createTable_alter(String[][] data){ | ||
| 106 | + JTable processed_table = new JTable(data, header); | ||
| 107 | + JScrollPane jScrollPane = new JScrollPane(processed_table); | ||
| 108 | + add(jScrollPane); | ||
| 109 | + } | ||
| 110 | +} | ||
| 111 | + | ||
| 112 | +class createTable extends JPanel { | ||
| 113 | + long start = System.currentTimeMillis(); | ||
| 114 | + public createTable(List<String> data) { //constructor : display table | ||
| 115 | + getTableModel(data); | ||
| 116 | + } | ||
| 117 | + | ||
| 118 | + private DefaultTableModel getTableModel(List<String> data) { | ||
| 119 | + String column_n[]={"ip","app","device","os","channel","is_attributed","click_time", | ||
| 120 | + "avg_valid_click_count","click_time_delta","count_click_in_tenmin"}; | ||
| 121 | + Object tabledata[][]={}; | ||
| 122 | + DefaultTableModel model = new DefaultTableModel(tabledata,column_n); | ||
| 123 | + JTable jtable = new JTable(model); | ||
| 124 | + JScrollPane jScollPane = new JScrollPane(jtable); | ||
| 125 | + add(jScollPane); | ||
| 126 | + try { | ||
| 127 | + for(int i =0; i<data.size();i++){ | ||
| 128 | + BufferedReader reader = getFileReader(data.get(i)); | ||
| 129 | + String line = reader.readLine(); | ||
| 130 | + | ||
| 131 | + | ||
| 132 | + line = line.replace("\"", ""); | ||
| 133 | + line = line.replace("_", ""); | ||
| 134 | + //line = line.replace("\\{",""); | ||
| 135 | + line = line.replaceAll("\\{|\\}",""); | ||
| 136 | + line = line.replaceAll("\\w+:", ""); | ||
| 137 | + | ||
| 138 | + //System.out.println(line); | ||
| 139 | + Object [] temp= line.split(","); | ||
| 140 | + | ||
| 141 | + model.addRow(temp); | ||
| 142 | + | ||
| 143 | + reader.close(); | ||
| 144 | + } | ||
| 145 | + | ||
| 146 | + } catch (Exception e) { | ||
| 147 | + System.out.println(e); | ||
| 148 | + } | ||
| 149 | + long end = System.currentTimeMillis(); | ||
| 150 | + System.out.println("Steve's Procedure2 time elapsed : " + (end-start)/1000.0); | ||
| 151 | + | ||
| 152 | + return model; | ||
| 153 | + } | ||
| 154 | + | ||
| 155 | + private BufferedReader getFileReader(String data) { | ||
| 156 | + | ||
| 157 | + BufferedReader reader = new BufferedReader(new StringReader(data)); | ||
| 158 | + | ||
| 159 | + // In your real application the data would come from a file | ||
| 160 | + | ||
| 161 | + //Reader reader = new BufferedReader( new FileReader(...) ); | ||
| 162 | + | ||
| 163 | + return reader; | ||
| 164 | + } | ||
| 165 | +} | ||
| ... | \ No newline at end of file | ... | \ No newline at end of file |
| 1 | -import org.apache.spark.SparkConf; | ||
| 2 | -import org.apache.spark.api.java.JavaSparkContext; | ||
| 3 | -import org.apache.spark.sql.Dataset; | ||
| 4 | -import org.apache.spark.sql.Row; | ||
| 5 | -import org.apache.spark.sql.SparkSession; | ||
| 6 | -import org.apache.spark.sql.expressions.Window; | ||
| 7 | -import org.apache.spark.sql.expressions.WindowSpec; | ||
| 8 | - | ||
| 9 | -import javax.xml.crypto.Data; | ||
| 10 | - | ||
| 11 | -import static org.apache.spark.sql.functions.*; | ||
| 12 | - | ||
| 13 | -public class calForwardTimeDelta { | ||
| 14 | - static SparkConf conf = new SparkConf().setMaster("local[*]").setAppName("Cesco"); | ||
| 15 | - static JavaSparkContext sc = new JavaSparkContext(conf); | ||
| 16 | - | ||
| 17 | - public static void main(String[] args) throws Exception{ | ||
| 18 | - //Create Session | ||
| 19 | - SparkSession spark = SparkSession | ||
| 20 | - .builder() | ||
| 21 | - .appName("Detecting Fraud Clicks") | ||
| 22 | - .getOrCreate(); | ||
| 23 | - | ||
| 24 | - //run methods here | ||
| 25 | - calcDelta(spark); | ||
| 26 | - } | ||
| 27 | - | ||
| 28 | - private static void calcDelta(SparkSession spark){ | ||
| 29 | - // put the path the file you gonna deal with being placed | ||
| 30 | - String filepath = "/home/chris/.kaggle/competitions/talkingdata-adtracking-fraud-detection/mnt/ssd/kaggle-talkingdata2/competition_files/train_sample.csv"; | ||
| 31 | - | ||
| 32 | - // create Dataset from files | ||
| 33 | - Dataset<Row> logDF = spark.read() | ||
| 34 | - .format("csv") | ||
| 35 | - .option("inferSchema", "true") | ||
| 36 | - .option("header","true") | ||
| 37 | - .load(filepath); | ||
| 38 | - | ||
| 39 | - // cast timestamp(click_time, attributed_time) type to long type | ||
| 40 | - | ||
| 41 | - //add column for long(click_time) | ||
| 42 | - Dataset<Row> newDF = logDF.withColumn("utc_click_time", logDF.col("click_time").cast("long")); | ||
| 43 | - //add column for long(attributed_time) | ||
| 44 | - newDF = newDF.withColumn("utc_attributed_time", logDF.col("attributed_time").cast("long")); | ||
| 45 | - //drop timestamp type columns | ||
| 46 | - newDF = newDF.drop("click_time").drop("attributed_time"); | ||
| 47 | - newDF.createOrReplaceTempView("logs"); | ||
| 48 | - | ||
| 49 | - WindowSpec w = Window.partitionBy ("ip") | ||
| 50 | - .orderBy("utc_click_time"); | ||
| 51 | - | ||
| 52 | - newDF = newDF.withColumn("lag(utc_click_time)", lag("utc_click_time",1).over(w)); | ||
| 53 | - newDF.where("ip=10").show(); | ||
| 54 | - newDF = newDF.withColumn("delta", when(col("lag(utc_click_time)").isNull(),lit(0)).otherwise(col("utc_click_time")).minus(when(col("lag(utc_click_time)").isNull(),lit(0)).otherwise(col("lag(utc_click_time)")))); | ||
| 55 | - //newDF = newDF.withColumn("delta", datediff()); | ||
| 56 | - newDF = newDF.drop("lag(utc_click_time)"); | ||
| 57 | - newDF = newDF.orderBy("ip"); | ||
| 58 | - | ||
| 59 | - newDF.show(); | ||
| 60 | - } | ||
| 61 | - | ||
| 62 | -} |
-
Please register or login to post a comment