KimchiSoup(junu)

Merge branch 'ml' of https://github.com/Java-Cesco/Detecting_fraud_clicks into feauture/GUI_2

@@ -75,3 +75,8 @@ fabric.properties
# virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml
hs_err_pid*
# datafile
train.zip
train.csv
\ No newline at end of file
Detecting_fraud_clicks
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="MarkdownExportedFiles">
<htmlFiles />
<imageFiles />
<otherFiles />
</component>
</project>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="MarkdownProjectSettings">
<PreviewSettings splitEditorLayout="SPLIT" splitEditorPreview="PREVIEW" useGrayscaleRendering="false" zoomFactor="1.0" maxImageWidth="0" showGitHubPageIfSynced="false" allowBrowsingInPreview="false" synchronizePreviewPosition="true" highlightPreviewType="NONE" highlightFadeOut="5" highlightOnTyping="true" synchronizeSourcePosition="true" verticallyAlignSourceAndPreviewSyncPosition="true" showSearchHighlightsInPreview="false" showSelectionInPreview="true">
<PanelProvider>
<provider providerId="com.vladsch.idea.multimarkdown.editor.swing.html.panel" providerName="Default - Swing" />
</PanelProvider>
</PreviewSettings>
<ParserSettings gitHubSyntaxChange="false">
<PegdownExtensions>
<option name="ABBREVIATIONS" value="false" />
<option name="ANCHORLINKS" value="true" />
<option name="ASIDE" value="false" />
<option name="ATXHEADERSPACE" value="true" />
<option name="AUTOLINKS" value="true" />
<option name="DEFINITIONS" value="false" />
<option name="DEFINITION_BREAK_DOUBLE_BLANK_LINE" value="false" />
<option name="FENCED_CODE_BLOCKS" value="true" />
<option name="FOOTNOTES" value="false" />
<option name="HARDWRAPS" value="false" />
<option name="HTML_DEEP_PARSER" value="false" />
<option name="INSERTED" value="false" />
<option name="QUOTES" value="false" />
<option name="RELAXEDHRULES" value="true" />
<option name="SMARTS" value="false" />
<option name="STRIKETHROUGH" value="true" />
<option name="SUBSCRIPT" value="false" />
<option name="SUPERSCRIPT" value="false" />
<option name="SUPPRESS_HTML_BLOCKS" value="false" />
<option name="SUPPRESS_INLINE_HTML" value="false" />
<option name="TABLES" value="true" />
<option name="TASKLISTITEMS" value="true" />
<option name="TOC" value="false" />
<option name="WIKILINKS" value="true" />
</PegdownExtensions>
<ParserOptions>
<option name="COMMONMARK_LISTS" value="true" />
<option name="DUMMY" value="false" />
<option name="EMOJI_SHORTCUTS" value="true" />
<option name="FLEXMARK_FRONT_MATTER" value="false" />
<option name="GFM_LOOSE_BLANK_LINE_AFTER_ITEM_PARA" value="false" />
<option name="GFM_TABLE_RENDERING" value="true" />
<option name="GITBOOK_URL_ENCODING" value="false" />
<option name="GITHUB_EMOJI_URL" value="false" />
<option name="GITHUB_LISTS" value="false" />
<option name="GITHUB_WIKI_LINKS" value="true" />
<option name="JEKYLL_FRONT_MATTER" value="false" />
<option name="SIM_TOC_BLANK_LINE_SPACER" value="true" />
</ParserOptions>
</ParserSettings>
<HtmlSettings headerTopEnabled="false" headerBottomEnabled="false" bodyTopEnabled="false" bodyBottomEnabled="false" embedUrlContent="false" addPageHeader="true">
<GeneratorProvider>
<provider providerId="com.vladsch.idea.multimarkdown.editor.swing.html.generator" providerName="Default Swing HTML Generator" />
</GeneratorProvider>
<headerTop />
<headerBottom />
<bodyTop />
<bodyBottom />
</HtmlSettings>
<CssSettings previewScheme="UI_SCHEME" cssUri="" isCssUriEnabled="false" isCssTextEnabled="false" isDynamicPageWidth="true">
<StylesheetProvider>
<provider providerId="com.vladsch.idea.multimarkdown.editor.swing.html.css" providerName="Default Swing Stylesheet" />
</StylesheetProvider>
<ScriptProviders />
<cssText />
</CssSettings>
<HtmlExportSettings updateOnSave="false" parentDir="$ProjectFileDir$" targetDir="$ProjectFileDir$" cssDir="" scriptDir="" plainHtml="false" imageDir="" copyLinkedImages="false" imageUniquifyType="0" targetExt="" useTargetExt="false" noCssNoScripts="false" linkToExportedHtml="true" exportOnSettingsChange="true" regenerateOnProjectOpen="false" />
<LinkMapSettings>
<textMaps />
</LinkMapSettings>
</component>
</project>
\ No newline at end of file
<component name="MarkdownNavigator.ProfileManager">
<settings default="" pdf-export="" />
</component>
\ No newline at end of file
@@ -11,4 +11,14 @@
<component name="ProjectRootManager" version="2" languageLevel="JDK_1_8" project-jdk-name="1.8" project-jdk-type="JavaSDK">
<output url="file://$PROJECT_DIR$/out" />
</component>
<component name="MavenProjectsManager">
<option name="originalFiles">
<list>
<option value="$PROJECT_DIR$/pom.xml" />
</list>
</option>
</component>
<component name="ProjectRootManager" version="2" languageLevel="JDK_1_8" default="false" project-jdk-name="1.8" project-jdk-type="JavaSDK">
<output url="file:///tmp" />
</component>
</project>
\ No newline at end of file
# 2018-JAVA-Cesco
Detecting fraud clicks using machine learning
## Execution script
### Amazon Linux
```bash
# update
sudo yum update -y
# install git
sudo yum install git -y
# install maven and java 1.8
sudo wget http://repos.fedorapeople.org/repos/dchen/apache-maven/epel-apache-maven.repo -O /etc/yum.repos.d/epel-apache-maven.repo
sudo sed -i s/\$releasever/6/g /etc/yum.repos.d/epel-apache-maven.repo
sudo yum install -y apache-maven java-1.8.0-openjdk-devel.x86_64
mvn --version
# clone repo
git clone https://github.com/Java-Cesco/Detecting_fraud_clicks.git
cd Detecting_fraud_clicks
# maven build
mvn package
# run
java8 -jar target/assembly/Detecting_fraud_clicks-aggregation.jar train_sample.csv agg_data
java8 -jar target/assembly/Detecting_fraud_clicks-decisionTree.jar agg_data
```
> NOTE: if you run into a memory error, add the `-Xmx2g` option to the `java` command, e.g. `java8 -Xmx2g -jar target/assembly/Detecting_fraud_clicks-aggregation.jar train_sample.csv agg_data`
\ No newline at end of file
@@ -16,13 +16,16 @@
<artifactId>spark-core_2.11</artifactId>
<version>2.3.0</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.spark/spark-sql -->
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-mllib_2.11</artifactId>
<version>2.3.0</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_2.11</artifactId>
<version>2.3.0</version>
</dependency>
<dependency>
<groupId>com.databricks</groupId>
<artifactId>spark-csv_2.11</artifactId>
@@ -30,19 +33,96 @@
</dependency>
</dependencies>
<!--maven-shade-plugin (replaces the old maven-compiler-plugin block)-->
<build>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-shade-plugin</artifactId>
<executions>
<!-- Aggregation -->
<execution>
<id>aggregation</id>
<goals>
<goal>shade</goal>
</goals>
<configuration>
<outputFile>target/assembly/${project.artifactId}-aggregation.jar</outputFile>
<shadedArtifactAttached>true</shadedArtifactAttached>
<transformers>
<transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
<mainClass>detact.Aggregation</mainClass>
</transformer>
<transformer implementation="org.apache.maven.plugins.shade.resource.ServicesResourceTransformer"/>
</transformers>
<filters>
<filter>
<artifact>*:*</artifact>
<excludes>
<exclude>META-INF/*.SF</exclude>
<exclude>META-INF/*.DSA</exclude>
<exclude>META-INF/*.RSA</exclude>
</excludes>
</filter>
</filters>
</configuration>
</execution>
<!-- Decision Tree -->
<execution>
<id>decisionTree</id>
<goals>
<goal>shade</goal>
</goals>
<configuration>
<outputFile>target/assembly/${project.artifactId}-decisionTree.jar</outputFile>
<shadedArtifactAttached>true</shadedArtifactAttached>
<transformers>
<transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
<mainClass>detact.ML.DecisionTree</mainClass>
</transformer>
<transformer implementation="org.apache.maven.plugins.shade.resource.ServicesResourceTransformer"/>
</transformers>
<filters>
<filter>
<artifact>*:*</artifact>
<excludes>
<exclude>META-INF/*.SF</exclude>
<exclude>META-INF/*.DSA</exclude>
<exclude>META-INF/*.RSA</exclude>
</excludes>
</filter>
</filters>
</configuration>
</execution>
<!-- Main -->
<execution>
<id>Main</id>
<goals>
<goal>shade</goal>
</goals>
<configuration>
<outputFile>target/assembly/${project.artifactId}-main.jar</outputFile>
<shadedArtifactAttached>true</shadedArtifactAttached>
<transformers>
<transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
<mainClass>detact.Main</mainClass>
</transformer>
<transformer implementation="org.apache.maven.plugins.shade.resource.ServicesResourceTransformer"/>
</transformers>
<filters>
<filter>
<artifact>*:*</artifact>
<excludes>
<exclude>META-INF/*.SF</exclude>
<exclude>META-INF/*.DSA</exclude>
<exclude>META-INF/*.RSA</exclude>
</excludes>
</filter>
</filters>
</configuration>
</execution>
</executions>
</plugin>
</plugins>
</build>
</project>
\ No newline at end of file
package detact;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.expressions.Window;
import org.apache.spark.sql.expressions.WindowSpec;
import static org.apache.spark.sql.functions.*;
public class Aggregation {
public static void main(String[] args) {
if (args.length != 2) {
System.out.println("Usage: java -jar aggregation.jar <data_path> <result_path>");
System.exit(0);
}
String data_path = args[0];
String result_path = args[1];
//Create Session
SparkSession spark = SparkSession
.builder()
.appName("Detecting Fraud Clicks")
.master("local")
.getOrCreate();
// run the aggregation steps
Aggregation agg = new Aggregation();
Dataset<Row> dataset = Utill.loadCSVDataSet(data_path, spark);
dataset = agg.changeTimestempToLong(dataset);
dataset = agg.averageValidClickCount(dataset);
dataset = agg.clickTimeDelta(dataset);
dataset = agg.countClickInTenMinutes(dataset);
// test
// dataset.where("ip == '5348' and app == '19'").show(10);
// save the aggregated features to CSV
Utill.saveCSVDataSet(dataset, result_path);
}
public Dataset<Row> changeTimestempToLong(Dataset<Row> dataset){
// cast the timestamp columns to long (seconds since epoch)
Dataset<Row> newDF = dataset.withColumn("utc_click_time", dataset.col("click_time").cast("long"));
newDF = newDF.withColumn("utc_attributed_time", dataset.col("attributed_time").cast("long"));
newDF = newDF.drop("click_time").drop("attributed_time");
return newDF;
}
public Dataset<Row> averageValidClickCount(Dataset<Row> dataset){
// window partitioned by (ip, app), ordered by utc_click_time, spanning the first row through the current row
WindowSpec w = Window.partitionBy("ip", "app")
.orderBy("utc_click_time")
.rowsBetween(Window.unboundedPreceding(), Window.currentRow());
// running count of clicks and running sum of is_attributed give the valid-click ratio
Dataset<Row> newDF = dataset.withColumn("cum_count_click", count("utc_click_time").over(w));
newDF = newDF.withColumn("cum_sum_attributed", sum("is_attributed").over(w));
newDF = newDF.withColumn("avg_valid_click_count", col("cum_sum_attributed").divide(col("cum_count_click")));
newDF = newDF.drop("cum_count_click", "cum_sum_attributed");
return newDF;
}
public Dataset<Row> clickTimeDelta(Dataset<Row> dataset){
// gap in seconds between consecutive clicks from the same ip (0 for the ip's first click)
WindowSpec w = Window.partitionBy("ip")
.orderBy("utc_click_time");
Dataset<Row> newDF = dataset.withColumn("lag(utc_click_time)", lag("utc_click_time",1).over(w));
newDF = newDF.withColumn("click_time_delta", when(col("lag(utc_click_time)").isNull(),
lit(0)).otherwise(col("utc_click_time")).minus(when(col("lag(utc_click_time)").isNull(),
lit(0)).otherwise(col("lag(utc_click_time)"))));
newDF = newDF.drop("lag(utc_click_time)");
return newDF;
}
public Dataset<Row> countClickInTenMinutes(Dataset<Row> dataset){
// range window over the next 600 seconds (10 minutes) of clicks from the same ip
WindowSpec w = Window.partitionBy("ip")
.orderBy("utc_click_time")
.rangeBetween(Window.currentRow(), Window.currentRow() + 600);
// count clicks in that window; subtract 1 to exclude the current click itself
Dataset<Row> newDF = dataset.withColumn("count_click_in_ten_mins",
(count("utc_click_time").over(w)).minus(1));
return newDF;
}
}
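An editorial aside: the nested `when(...)` calls in `clickTimeDelta` above implement a null-safe difference. A minimal equivalent sketch using `coalesce` (the class name `ClickTimeDeltaSketch` is ours, not part of the repo):

```java
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.expressions.Window;
import org.apache.spark.sql.expressions.WindowSpec;
import static org.apache.spark.sql.functions.*;

public class ClickTimeDeltaSketch {
    // same semantics as Aggregation.clickTimeDelta: lag() is null only for the
    // first click of an ip, and coalesce maps that null difference to 0
    public static Dataset<Row> clickTimeDelta(Dataset<Row> dataset) {
        WindowSpec w = Window.partitionBy("ip").orderBy("utc_click_time");
        return dataset.withColumn("click_time_delta",
                coalesce(col("utc_click_time").minus(lag("utc_click_time", 1).over(w)),
                        lit(0L)));
    }
}
```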
package detact.ML;
import detact.Aggregation;
import detact.Utill;
import org.apache.spark.ml.Pipeline;
import org.apache.spark.ml.PipelineModel;
import org.apache.spark.ml.PipelineStage;
import org.apache.spark.ml.evaluation.RegressionEvaluator;
import org.apache.spark.ml.feature.VectorAssembler;
import org.apache.spark.ml.feature.VectorIndexer;
import org.apache.spark.ml.feature.VectorIndexerModel;
import org.apache.spark.ml.regression.DecisionTreeRegressionModel;
import org.apache.spark.ml.regression.DecisionTreeRegressor;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
// DecisionTree Model
public class DecisionTree {
public static void main(String[] args) throws Exception {
if (args.length != 1) {
System.out.println("Usage: java -jar decisionTree.jar <agg_path>");
System.exit(0);
}
String agg_path = args[0];
//Create Session
SparkSession spark = SparkSession
.builder()
.appName("Detecting Fraud Clicks")
.master("local")
.getOrCreate();
// load aggregated dataset
Dataset<Row> resultds = Utill.loadCSVDataSet(agg_path, spark);
// show Dataset schema
// System.out.println("schema start");
// resultds.printSchema();
// String[] cols = resultds.columns();
// for (String col : cols) {
// System.out.println(col);
// }
// System.out.println("schema end");
VectorAssembler assembler = new VectorAssembler()
.setInputCols(new String[]{
"ip",
"app",
"device",
"os",
"channel",
"utc_click_time",
"avg_valid_click_count",
"click_time_delta",
"count_click_in_ten_mins"
})
.setOutputCol("features");
Dataset<Row> output = assembler.transform(resultds);
VectorIndexerModel featureIndexer = new VectorIndexer()
.setInputCol("features")
.setOutputCol("indexedFeatures")
.setMaxCategories(2)
.fit(output);
// Split the result into training and test sets (30% held out for testing).
Dataset<Row>[] splits = output.randomSplit(new double[]{0.7, 0.3});
Dataset<Row> trainingData = splits[0];
Dataset<Row> testData = splits[1];
// Train a DecisionTree regression model.
DecisionTreeRegressor dt = new DecisionTreeRegressor()
.setFeaturesCol("indexedFeatures")
.setLabelCol("is_attributed")
.setMaxDepth(10);
// Chain indexer and tree in a Pipeline.
Pipeline pipeline = new Pipeline()
.setStages(new PipelineStage[]{featureIndexer, dt});
// Train model. This also runs the indexer.
PipelineModel model = pipeline.fit(trainingData);
// Make predictions.
Dataset<Row> predictions = model.transform(testData);
// Select example rows to display.
predictions.select("is_attributed", "features").show(5);
// Select (prediction, true label) and compute test error.
RegressionEvaluator evaluator = new RegressionEvaluator()
.setLabelCol("is_attributed")
.setPredictionCol("prediction")
.setMetricName("rmse");
double rmse = evaluator.evaluate(predictions);
System.out.println("Root Mean Squared Error (RMSE) on test result = " + rmse);
DecisionTreeRegressionModel treeModel =
(DecisionTreeRegressionModel) (model.stages()[1]);
System.out.println("Learned regression tree model:\n" + treeModel.toDebugString());
// save model
model.save("./decisionTree");
// load the model back
PipelineModel load_model = PipelineModel.load("./decisionTree");
// make predictions with the reloaded model
Dataset<Row> load_pred = load_model.transform(testData);
}
}
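Worth noting: `is_attributed` is a 0/1 label, yet the pipeline trains a regressor and reports RMSE. A quick supplementary check (not in the repo) is to threshold the regression output at 0.5 and measure plain accuracy; a minimal sketch over the `predictions` Dataset produced above:

```java
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import static org.apache.spark.sql.functions.*;

public class ThresholdCheck {
    // hypothetical helper: treat the regressor's output as a score and
    // threshold it at 0.5 to get a 0/1 decision, then compute accuracy
    public static double accuracy(Dataset<Row> predictions) {
        Dataset<Row> labeled = predictions.withColumn("predicted_label",
                when(col("prediction").geq(0.5), lit(1)).otherwise(lit(0)));
        long total = labeled.count();
        long correct = labeled
                .filter(col("predicted_label").equalTo(col("is_attributed")))
                .count();
        return total == 0 ? 0.0 : (double) correct / total;
    }
}
```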
package detact;
import org.apache.spark.ml.Pipeline;
import org.apache.spark.ml.PipelineModel;
import org.apache.spark.ml.PipelineStage;
import org.apache.spark.ml.evaluation.RegressionEvaluator;
import org.apache.spark.ml.feature.VectorAssembler;
import org.apache.spark.ml.feature.VectorIndexer;
import org.apache.spark.ml.feature.VectorIndexerModel;
import org.apache.spark.ml.regression.DecisionTreeRegressionModel;
import org.apache.spark.ml.regression.DecisionTreeRegressor;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
public class Main {
public static void main(String[] args) throws Exception{
if (args.length != 1) {
System.out.println("Usage: java -jar aggregation.jar <data_path>");
System.exit(0);
}
String data_path = args[0];
//Create Session
SparkSession spark = SparkSession
.builder()
.appName("Detecting Fraud Clicks")
.master("local")
.getOrCreate();
// run the aggregation steps
Aggregation agg = new Aggregation();
Dataset<Row> dataset = Utill.loadCSVDataSet(data_path, spark);
dataset = agg.changeTimestempToLong(dataset);
dataset = agg.averageValidClickCount(dataset);
dataset = agg.clickTimeDelta(dataset);
dataset = agg.countClickInTenMinutes(dataset);
VectorAssembler assembler = new VectorAssembler()
.setInputCols(new String[]{
"ip",
"app",
"device",
"os",
"channel",
"utc_click_time",
"avg_valid_click_count",
"click_time_delta",
"count_click_in_ten_mins"
})
.setOutputCol("features");
Dataset<Row> output = assembler.transform(dataset);
VectorIndexerModel featureIndexer = new VectorIndexer()
.setInputCol("features")
.setOutputCol("indexedFeatures")
.setMaxCategories(2)
.fit(output);
// Split the result into training and test sets (30% held out for testing).
Dataset<Row>[] splits = output.randomSplit(new double[]{0.7, 0.3});
Dataset<Row> trainingData = splits[0];
Dataset<Row> testData = splits[1];
// Train a DecisionTree regression model.
DecisionTreeRegressor dt = new DecisionTreeRegressor()
.setFeaturesCol("indexedFeatures")
.setLabelCol("is_attributed")
.setMaxDepth(10);
// Chain indexer and tree in a Pipeline.
Pipeline pipeline = new Pipeline()
.setStages(new PipelineStage[]{featureIndexer, dt});
// Train model. This also runs the indexer.
PipelineModel model = pipeline.fit(trainingData);
// save model
model.save("./decisionTree");
PipelineModel p_model = PipelineModel.load("./decisionTree");
// Make predictions.
Dataset<Row> predictions = p_model.transform(testData);
// Select example rows to display.
predictions.select("is_attributed", "features").show(5);
// Select (prediction, true label) and compute test error.
RegressionEvaluator evaluator = new RegressionEvaluator()
.setLabelCol("is_attributed")
.setPredictionCol("prediction")
.setMetricName("rmse");
double rmse = evaluator.evaluate(predictions);
System.out.println("Root Mean Squared Error (RMSE) on test result = " + rmse);
DecisionTreeRegressionModel treeModel =
(DecisionTreeRegressionModel) (p_model.stages()[1]);
System.out.println("Learned regression tree model:\n" + treeModel.toDebugString());
}
}
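One reproducibility nit that applies to both `DecisionTree` and `Main`: `randomSplit` is called without a seed, so the train/test partition, and therefore the reported RMSE, changes from run to run. A seeded variant (the seed value 42 is arbitrary):

```java
// a fixed seed makes the 70/30 split, and the resulting metrics, reproducible
Dataset<Row>[] splits = output.randomSplit(new double[]{0.7, 0.3}, 42L);
```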
package detact;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
public class Utill {
public static Dataset<Row> loadCSVDataSet(String path, SparkSession spark){
// read a CSV file into a Dataset (header row, inferred schema)
return spark.read().format("com.databricks.spark.csv")
.option("inferSchema", "true")
.option("header", "true")
.load(path);
}
public static void saveCSVDataSet(Dataset<Row> dataset, String path){
// write the Dataset out as CSV with a header row
// (inferSchema is a read-side option, so it is dropped here)
dataset.write().format("com.databricks.spark.csv")
.option("header", "true")
.save(path);
.save(path);
}
}
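Finally, a minimal usage sketch for `Utill` (the file paths are illustrative placeholders, not from the repo):

```java
package detact;

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class UtillRoundTrip {
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder()
                .appName("Utill round trip")
                .master("local")
                .getOrCreate();
        // read a CSV with header and inferred schema, then write it back out
        Dataset<Row> ds = Utill.loadCSVDataSet("train_sample.csv", spark);
        Utill.saveCSVDataSet(ds, "train_sample_copy");
        spark.stop();
    }
}
```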