KimchiSoup(junu)

Merge branch 'ml' of https://github.com/Java-Cesco/Detecting_fraud_clicks into feauture/GUI_2

@@ -75,3 +75,8 @@ fabric.properties
# virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml
hs_err_pid*
# datafile
train.zip
train.csv
\ No newline at end of file
Detecting_fraud_clicks
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="MarkdownExportedFiles">
<htmlFiles />
<imageFiles />
<otherFiles />
</component>
</project>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="MarkdownProjectSettings">
<PreviewSettings splitEditorLayout="SPLIT" splitEditorPreview="PREVIEW" useGrayscaleRendering="false" zoomFactor="1.0" maxImageWidth="0" showGitHubPageIfSynced="false" allowBrowsingInPreview="false" synchronizePreviewPosition="true" highlightPreviewType="NONE" highlightFadeOut="5" highlightOnTyping="true" synchronizeSourcePosition="true" verticallyAlignSourceAndPreviewSyncPosition="true" showSearchHighlightsInPreview="false" showSelectionInPreview="true">
<PanelProvider>
<provider providerId="com.vladsch.idea.multimarkdown.editor.swing.html.panel" providerName="Default - Swing" />
</PanelProvider>
</PreviewSettings>
<ParserSettings gitHubSyntaxChange="false">
<PegdownExtensions>
<option name="ABBREVIATIONS" value="false" />
<option name="ANCHORLINKS" value="true" />
<option name="ASIDE" value="false" />
<option name="ATXHEADERSPACE" value="true" />
<option name="AUTOLINKS" value="true" />
<option name="DEFINITIONS" value="false" />
<option name="DEFINITION_BREAK_DOUBLE_BLANK_LINE" value="false" />
<option name="FENCED_CODE_BLOCKS" value="true" />
<option name="FOOTNOTES" value="false" />
<option name="HARDWRAPS" value="false" />
<option name="HTML_DEEP_PARSER" value="false" />
<option name="INSERTED" value="false" />
<option name="QUOTES" value="false" />
<option name="RELAXEDHRULES" value="true" />
<option name="SMARTS" value="false" />
<option name="STRIKETHROUGH" value="true" />
<option name="SUBSCRIPT" value="false" />
<option name="SUPERSCRIPT" value="false" />
<option name="SUPPRESS_HTML_BLOCKS" value="false" />
<option name="SUPPRESS_INLINE_HTML" value="false" />
<option name="TABLES" value="true" />
<option name="TASKLISTITEMS" value="true" />
<option name="TOC" value="false" />
<option name="WIKILINKS" value="true" />
</PegdownExtensions>
<ParserOptions>
<option name="COMMONMARK_LISTS" value="true" />
<option name="DUMMY" value="false" />
<option name="EMOJI_SHORTCUTS" value="true" />
<option name="FLEXMARK_FRONT_MATTER" value="false" />
<option name="GFM_LOOSE_BLANK_LINE_AFTER_ITEM_PARA" value="false" />
<option name="GFM_TABLE_RENDERING" value="true" />
<option name="GITBOOK_URL_ENCODING" value="false" />
<option name="GITHUB_EMOJI_URL" value="false" />
<option name="GITHUB_LISTS" value="false" />
<option name="GITHUB_WIKI_LINKS" value="true" />
<option name="JEKYLL_FRONT_MATTER" value="false" />
<option name="SIM_TOC_BLANK_LINE_SPACER" value="true" />
</ParserOptions>
</ParserSettings>
<HtmlSettings headerTopEnabled="false" headerBottomEnabled="false" bodyTopEnabled="false" bodyBottomEnabled="false" embedUrlContent="false" addPageHeader="true">
<GeneratorProvider>
<provider providerId="com.vladsch.idea.multimarkdown.editor.swing.html.generator" providerName="Default Swing HTML Generator" />
</GeneratorProvider>
<headerTop />
<headerBottom />
<bodyTop />
<bodyBottom />
</HtmlSettings>
<CssSettings previewScheme="UI_SCHEME" cssUri="" isCssUriEnabled="false" isCssTextEnabled="false" isDynamicPageWidth="true">
<StylesheetProvider>
<provider providerId="com.vladsch.idea.multimarkdown.editor.swing.html.css" providerName="Default Swing Stylesheet" />
</StylesheetProvider>
<ScriptProviders />
<cssText />
</CssSettings>
<HtmlExportSettings updateOnSave="false" parentDir="$ProjectFileDir$" targetDir="$ProjectFileDir$" cssDir="" scriptDir="" plainHtml="false" imageDir="" copyLinkedImages="false" imageUniquifyType="0" targetExt="" useTargetExt="false" noCssNoScripts="false" linkToExportedHtml="true" exportOnSettingsChange="true" regenerateOnProjectOpen="false" />
<LinkMapSettings>
<textMaps />
</LinkMapSettings>
</component>
</project>
\ No newline at end of file
<component name="MarkdownNavigator.ProfileManager">
<settings default="" pdf-export="" />
</component>
\ No newline at end of file
@@ -11,4 +11,14 @@
<component name="ProjectRootManager" version="2" languageLevel="JDK_1_8" project-jdk-name="1.8" project-jdk-type="JavaSDK">
<output url="file://$PROJECT_DIR$/out" />
</component>
<component name="MavenProjectsManager">
<option name="originalFiles">
<list>
<option value="$PROJECT_DIR$/pom.xml" />
</list>
</option>
</component>
<component name="ProjectRootManager" version="2" languageLevel="JDK_1_8" default="false" project-jdk-name="1.8" project-jdk-type="JavaSDK">
<output url="file:///tmp" />
</component>
</project>
\ No newline at end of file
# 2018-JAVA-Cesco
Detecting fraud clicks using machine learning
## Execution script
### Amazon Linux
```bash
# update
sudo yum update -y
# install git
sudo yum install git -y
# install maven and java 1.8
sudo wget http://repos.fedorapeople.org/repos/dchen/apache-maven/epel-apache-maven.repo -O /etc/yum.repos.d/epel-apache-maven.repo
sudo sed -i s/\$releasever/6/g /etc/yum.repos.d/epel-apache-maven.repo
sudo yum install -y apache-maven java-1.8.0-openjdk-devel.x86_64
mvn --version
# clone repo
git clone https://github.com/Java-Cesco/Detecting_fraud_clicks.git
cd Detecting_fraud_clicks
# maven build
mvn package
# run
java8 -jar target/assembly/Detecting_fraud_clicks-aggregation.jar train_sample.csv agg_data
java8 -jar target/assembly/Detecting_fraud_clicks-decisionTree.jar agg_data
```
> NOTE: if you run into a memory error, add the `-Xmx2g` option to the `java` command, e.g. `java8 -Xmx2g -jar target/assembly/Detecting_fraud_clicks-aggregation.jar train_sample.csv agg_data`
\ No newline at end of file
@@ -16,13 +16,16 @@
<artifactId>spark-core_2.11</artifactId>
<version>2.3.0</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.spark/spark-sql -->
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-mllib_2.11</artifactId>
<version>2.3.0</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_2.11</artifactId>
<version>2.3.0</version>
</dependency>
<dependency>
<groupId>com.databricks</groupId>
<artifactId>spark-csv_2.11</artifactId>
@@ -30,19 +33,96 @@
</dependency>
</dependencies>
<!--maven-shade-plugin (replaces the old maven-compiler-plugin block)-->
<build>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-shade-plugin</artifactId>
<executions>
<!-- Aggregation -->
<execution>
<id>aggregation</id>
<goals>
<goal>shade</goal>
</goals>
<configuration>
<outputFile>target/assembly/${project.artifactId}-aggregation.jar</outputFile>
<shadedArtifactAttached>true</shadedArtifactAttached>
<transformers>
<transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
<mainClass>detact.Aggregation</mainClass>
</transformer>
<transformer implementation="org.apache.maven.plugins.shade.resource.ServicesResourceTransformer"/>
</transformers>
<filters>
<filter>
<artifact>*:*</artifact>
<excludes>
<exclude>META-INF/*.SF</exclude>
<exclude>META-INF/*.DSA</exclude>
<exclude>META-INF/*.RSA</exclude>
</excludes>
</filter>
</filters>
</configuration>
</execution>
<!-- Decision Tree -->
<execution>
<id>decisionTree</id>
<goals>
<goal>shade</goal>
</goals>
<configuration>
<outputFile>target/assembly/${project.artifactId}-decisionTree.jar</outputFile>
<shadedArtifactAttached>true</shadedArtifactAttached>
<transformers>
<transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
<mainClass>detact.ML.DecisionTree</mainClass>
</transformer>
<transformer implementation="org.apache.maven.plugins.shade.resource.ServicesResourceTransformer"/>
</transformers>
<filters>
<filter>
<artifact>*:*</artifact>
<excludes>
<exclude>META-INF/*.SF</exclude>
<exclude>META-INF/*.DSA</exclude>
<exclude>META-INF/*.RSA</exclude>
</excludes>
</filter>
</filters>
</configuration>
</execution>
<!-- Main -->
<execution>
<id>Main</id>
<goals>
<goal>shade</goal>
</goals>
<configuration>
<outputFile>target/assembly/${project.artifactId}-main.jar</outputFile>
<shadedArtifactAttached>true</shadedArtifactAttached>
<transformers>
<transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
<mainClass>detact.Main</mainClass>
</transformer>
<transformer implementation="org.apache.maven.plugins.shade.resource.ServicesResourceTransformer"/>
</transformers>
<filters>
<filter>
<artifact>*:*</artifact>
<excludes>
<exclude>META-INF/*.SF</exclude>
<exclude>META-INF/*.DSA</exclude>
<exclude>META-INF/*.RSA</exclude>
</excludes>
</filter>
</filters>
</configuration>
</execution>
</executions>
</plugin>
</plugins>
</build>
</project>
\ No newline at end of file
package detact;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.expressions.Window;
import org.apache.spark.sql.expressions.WindowSpec;
import static org.apache.spark.sql.functions.*;
public class Aggregation {
public static void main(String[] args) {
if (args.length != 2) {
System.out.println("Usage: java -jar aggregation.jar <data_path> <result_path>");
System.exit(0);
}
String data_path = args[0];
String result_path = args[1];
//Create Session
SparkSession spark = SparkSession
.builder()
.appName("Detecting Fraud Clicks")
.master("local")
.getOrCreate();
// run the aggregation steps
Aggregation agg = new Aggregation();
Dataset<Row> dataset = Utill.loadCSVDataSet(data_path, spark);
dataset = agg.changeTimestempToLong(dataset);
dataset = agg.averageValidClickCount(dataset);
dataset = agg.clickTimeDelta(dataset);
dataset = agg.countClickInTenMinutes(dataset);
// test
// dataset.where("ip == '5348' and app == '19'").show(10);
// save the aggregated features to CSV
Utill.saveCSVDataSet(dataset, result_path);
}
public Dataset<Row> changeTimestempToLong(Dataset<Row> dataset){
// cast the timestamp columns to long (seconds since epoch)
Dataset<Row> newDF = dataset.withColumn("utc_click_time", dataset.col("click_time").cast("long"));
newDF = newDF.withColumn("utc_attributed_time", dataset.col("attributed_time").cast("long"));
newDF = newDF.drop("click_time").drop("attributed_time");
return newDF;
}
public Dataset<Row> averageValidClickCount(Dataset<Row> dataset){
// window partitioned by (ip, app), ordered by utc_click_time, spanning the first row through the current row
WindowSpec w = Window.partitionBy("ip", "app")
.orderBy("utc_click_time")
.rowsBetween(Window.unboundedPreceding(), Window.currentRow());
// running count of clicks and running sum of is_attributed give the valid-click ratio
Dataset<Row> newDF = dataset.withColumn("cum_count_click", count("utc_click_time").over(w));
newDF = newDF.withColumn("cum_sum_attributed", sum("is_attributed").over(w));
newDF = newDF.withColumn("avg_valid_click_count", col("cum_sum_attributed").divide(col("cum_count_click")));
newDF = newDF.drop("cum_count_click", "cum_sum_attributed");
return newDF;
}
public Dataset<Row> clickTimeDelta(Dataset<Row> dataset){
// gap in seconds between consecutive clicks from the same ip (0 for the ip's first click)
WindowSpec w = Window.partitionBy("ip")
.orderBy("utc_click_time");
Dataset<Row> newDF = dataset.withColumn("lag(utc_click_time)", lag("utc_click_time",1).over(w));
newDF = newDF.withColumn("click_time_delta", when(col("lag(utc_click_time)").isNull(),
lit(0)).otherwise(col("utc_click_time")).minus(when(col("lag(utc_click_time)").isNull(),
lit(0)).otherwise(col("lag(utc_click_time)"))));
newDF = newDF.drop("lag(utc_click_time)");
return newDF;
}
public Dataset<Row> countClickInTenMinutes(Dataset<Row> dataset){
// range window over the next 600 seconds (10 minutes) of clicks from the same ip
WindowSpec w = Window.partitionBy("ip")
.orderBy("utc_click_time")
.rangeBetween(Window.currentRow(), Window.currentRow() + 600);
// count clicks in that window; subtract 1 to exclude the current click itself
Dataset<Row> newDF = dataset.withColumn("count_click_in_ten_mins",
(count("utc_click_time").over(w)).minus(1));
return newDF;
}
}
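An editorial aside: the nested `when(...)` calls in `clickTimeDelta` above implement a null-safe difference. A minimal equivalent sketch using `coalesce` (the class name `ClickTimeDeltaSketch` is ours, not part of the repo):

```java
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.expressions.Window;
import org.apache.spark.sql.expressions.WindowSpec;
import static org.apache.spark.sql.functions.*;

public class ClickTimeDeltaSketch {
    // same semantics as Aggregation.clickTimeDelta: lag() is null only for the
    // first click of an ip, and coalesce maps that null difference to 0
    public static Dataset<Row> clickTimeDelta(Dataset<Row> dataset) {
        WindowSpec w = Window.partitionBy("ip").orderBy("utc_click_time");
        return dataset.withColumn("click_time_delta",
                coalesce(col("utc_click_time").minus(lag("utc_click_time", 1).over(w)),
                        lit(0L)));
    }
}
```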
package detact.ML;
import detact.Aggregation;
import detact.Utill;
import org.apache.spark.ml.Pipeline;
import org.apache.spark.ml.PipelineModel;
import org.apache.spark.ml.PipelineStage;
import org.apache.spark.ml.evaluation.RegressionEvaluator;
import org.apache.spark.ml.feature.VectorAssembler;
import org.apache.spark.ml.feature.VectorIndexer;
import org.apache.spark.ml.feature.VectorIndexerModel;
import org.apache.spark.ml.regression.DecisionTreeRegressionModel;
import org.apache.spark.ml.regression.DecisionTreeRegressor;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
// DecisionTree Model
public class DecisionTree {
public static void main(String[] args) throws Exception {
if (args.length != 1) {
System.out.println("Usage: java -jar decisionTree.jar <agg_path>");
System.exit(0);
}
String agg_path = args[0];
//Create Session
SparkSession spark = SparkSession
.builder()
.appName("Detecting Fraud Clicks")
.master("local")
.getOrCreate();
// load aggregated dataset
Dataset<Row> resultds = Utill.loadCSVDataSet(agg_path, spark);
// show Dataset schema
// System.out.println("schema start");
// resultds.printSchema();
// String[] cols = resultds.columns();
// for (String col : cols) {
// System.out.println(col);
// }
// System.out.println("schema end");
VectorAssembler assembler = new VectorAssembler()
.setInputCols(new String[]{
"ip",
"app",
"device",
"os",
"channel",
"utc_click_time",
"avg_valid_click_count",
"click_time_delta",
"count_click_in_ten_mins"
})
.setOutputCol("features");
Dataset<Row> output = assembler.transform(resultds);
VectorIndexerModel featureIndexer = new VectorIndexer()
.setInputCol("features")
.setOutputCol("indexedFeatures")
.setMaxCategories(2)
.fit(output);
// Split the result into training and test sets (30% held out for testing).
Dataset<Row>[] splits = output.randomSplit(new double[]{0.7, 0.3});
Dataset<Row> trainingData = splits[0];
Dataset<Row> testData = splits[1];
// Train a DecisionTree regression model.
DecisionTreeRegressor dt = new DecisionTreeRegressor()
.setFeaturesCol("indexedFeatures")
.setLabelCol("is_attributed")
.setMaxDepth(10);
// Chain indexer and tree in a Pipeline.
Pipeline pipeline = new Pipeline()
.setStages(new PipelineStage[]{featureIndexer, dt});
// Train model. This also runs the indexer.
PipelineModel model = pipeline.fit(trainingData);
// Make predictions.
Dataset<Row> predictions = model.transform(testData);
// Select example rows to display.
predictions.select("is_attributed", "features").show(5);
// Select (prediction, true label) and compute test error.
RegressionEvaluator evaluator = new RegressionEvaluator()
.setLabelCol("is_attributed")
.setPredictionCol("prediction")
.setMetricName("rmse");
double rmse = evaluator.evaluate(predictions);
System.out.println("Root Mean Squared Error (RMSE) on test result = " + rmse);
DecisionTreeRegressionModel treeModel =
(DecisionTreeRegressionModel) (model.stages()[1]);
System.out.println("Learned regression tree model:\n" + treeModel.toDebugString());
// save model
model.save("./decisionTree");
// load the model back
PipelineModel load_model = PipelineModel.load("./decisionTree");
// make predictions with the reloaded model
Dataset<Row> load_pred = load_model.transform(testData);
}
}
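Worth noting: `is_attributed` is a 0/1 label, yet the pipeline trains a regressor and reports RMSE. A quick supplementary check (not in the repo) is to threshold the regression output at 0.5 and measure plain accuracy; a minimal sketch over the `predictions` Dataset produced above:

```java
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import static org.apache.spark.sql.functions.*;

public class ThresholdCheck {
    // hypothetical helper: treat the regressor's output as a score and
    // threshold it at 0.5 to get a 0/1 decision, then compute accuracy
    public static double accuracy(Dataset<Row> predictions) {
        Dataset<Row> labeled = predictions.withColumn("predicted_label",
                when(col("prediction").geq(0.5), lit(1)).otherwise(lit(0)));
        long total = labeled.count();
        long correct = labeled
                .filter(col("predicted_label").equalTo(col("is_attributed")))
                .count();
        return total == 0 ? 0.0 : (double) correct / total;
    }
}
```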
package detact;
import org.apache.spark.ml.Pipeline;
import org.apache.spark.ml.PipelineModel;
import org.apache.spark.ml.PipelineStage;
import org.apache.spark.ml.evaluation.RegressionEvaluator;
import org.apache.spark.ml.feature.VectorAssembler;
import org.apache.spark.ml.feature.VectorIndexer;
import org.apache.spark.ml.feature.VectorIndexerModel;
import org.apache.spark.ml.regression.DecisionTreeRegressionModel;
import org.apache.spark.ml.regression.DecisionTreeRegressor;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
public class Main {
public static void main(String[] args) throws Exception{
if (args.length != 1) {
System.out.println("Usage: java -jar aggregation.jar <data_path>");
System.exit(0);
}
String data_path = args[0];
//Create Session
SparkSession spark = SparkSession
.builder()
.appName("Detecting Fraud Clicks")
.master("local")
.getOrCreate();
// run the aggregation steps
Aggregation agg = new Aggregation();
Dataset<Row> dataset = Utill.loadCSVDataSet(data_path, spark);
dataset = agg.changeTimestempToLong(dataset);
dataset = agg.averageValidClickCount(dataset);
dataset = agg.clickTimeDelta(dataset);
dataset = agg.countClickInTenMinutes(dataset);
VectorAssembler assembler = new VectorAssembler()
.setInputCols(new String[]{
"ip",
"app",
"device",
"os",
"channel",
"utc_click_time",
"avg_valid_click_count",
"click_time_delta",
"count_click_in_ten_mins"
})
.setOutputCol("features");
Dataset<Row> output = assembler.transform(dataset);
VectorIndexerModel featureIndexer = new VectorIndexer()
.setInputCol("features")
.setOutputCol("indexedFeatures")
.setMaxCategories(2)
.fit(output);
// Split the result into training and test sets (30% held out for testing).
Dataset<Row>[] splits = output.randomSplit(new double[]{0.7, 0.3});
Dataset<Row> trainingData = splits[0];
Dataset<Row> testData = splits[1];
// Train a DecisionTree regression model.
DecisionTreeRegressor dt = new DecisionTreeRegressor()
.setFeaturesCol("indexedFeatures")
.setLabelCol("is_attributed")
.setMaxDepth(10);
// Chain indexer and tree in a Pipeline.
Pipeline pipeline = new Pipeline()
.setStages(new PipelineStage[]{featureIndexer, dt});
// Train model. This also runs the indexer.
PipelineModel model = pipeline.fit(trainingData);
// save model
model.save("./decisionTree");
PipelineModel p_model = PipelineModel.load("./decisionTree");
// Make predictions.
Dataset<Row> predictions = p_model.transform(testData);
// Select example rows to display.
predictions.select("is_attributed", "features").show(5);
// Select (prediction, true label) and compute test error.
RegressionEvaluator evaluator = new RegressionEvaluator()
.setLabelCol("is_attributed")
.setPredictionCol("prediction")
.setMetricName("rmse");
double rmse = evaluator.evaluate(predictions);
System.out.println("Root Mean Squared Error (RMSE) on test result = " + rmse);
DecisionTreeRegressionModel treeModel =
(DecisionTreeRegressionModel) (p_model.stages()[1]);
System.out.println("Learned regression tree model:\n" + treeModel.toDebugString());
}
}
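One reproducibility nit that applies to both `DecisionTree` and `Main`: `randomSplit` is called without a seed, so the train/test partition, and therefore the reported RMSE, changes from run to run. A seeded variant (the seed value 42 is arbitrary):

```java
// a fixed seed makes the 70/30 split, and the resulting metrics, reproducible
Dataset<Row>[] splits = output.randomSplit(new double[]{0.7, 0.3}, 42L);
```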
package detact;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
public class Utill {
public static Dataset<Row> loadCSVDataSet(String path, SparkSession spark){
// read a CSV file into a Dataset (header row, inferred schema)
return spark.read().format("com.databricks.spark.csv")
.option("inferSchema", "true")
.option("header", "true")
.load(path);
}
public static void saveCSVDataSet(Dataset<Row> dataset, String path){
// write the Dataset out as CSV with a header row
// (inferSchema is a read-side option, so it is dropped here)
dataset.write().format("com.databricks.spark.csv")
.option("header", "true")
.save(path);
.save(path);
}
}
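Finally, a minimal usage sketch for `Utill` (the file paths are illustrative placeholders, not from the repo):

```java
package detact;

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class UtillRoundTrip {
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder()
                .appName("Utill round trip")
                .master("local")
                .getOrCreate();
        // read a CSV with header and inferred schema, then write it back out
        Dataset<Row> ds = Utill.loadCSVDataSet("train_sample.csv", spark);
        Utill.saveCSVDataSet(ds, "train_sample_copy");
        spark.stop();
    }
}
```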