Toggle navigation
Toggle navigation
This project
Loading...
Sign in
신은섭(Shin Eun Seop)
/
Detecting_fraud_clicks
Go to a project
Toggle navigation
Toggle navigation pinning
Projects
Groups
Snippets
Help
Project
Activity
Repository
Pipelines
Graphs
Issues
2
Merge Requests
0
Snippets
Network
Create a new issue
Builds
Commits
Issue Boards
Authored by
EC2 Default User
2018-06-12 13:01:46 +0000
Browse Files
Options
Browse Files
Download
Plain Diff
Commit
45684aa6288c3cc3ce1c56dcac6a83c78a88cf2d
45684aa6
2 parents
70ed7f20
d7db0a3d
Merge branch 'ml' of
https://github.com/Java-Cesco/Detecting_fraud_clicks
into ml
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
149 additions
and
4 deletions
pom.xml
src/main/java/detact/Aggregation.java
src/main/java/detact/ML/DecisionTree.java
src/main/java/detact/Main.java
pom.xml
View file @
45684aa
...
...
@@ -39,6 +39,7 @@
<groupId>
org.apache.maven.plugins
</groupId>
<artifactId>
maven-shade-plugin
</artifactId>
<executions>
<!-- Aggregation -->
<execution>
<id>
aggregation
</id>
<goals>
...
...
@@ -51,6 +52,7 @@
<transformer
implementation=
"org.apache.maven.plugins.shade.resource.ManifestResourceTransformer"
>
<mainClass>
detact.Aggregation
</mainClass>
</transformer>
<transformer
implementation=
"org.apache.maven.plugins.shade.resource.ServicesResourceTransformer"
/>
</transformers>
<filters>
<filter>
...
...
@@ -64,6 +66,7 @@
</filters>
</configuration>
</execution>
<!-- Decision Tree -->
<execution>
<id>
decisionTree
</id>
<goals>
...
...
@@ -76,6 +79,34 @@
<transformer
implementation=
"org.apache.maven.plugins.shade.resource.ManifestResourceTransformer"
>
<mainClass>
detact.ML.DecisionTree
</mainClass>
</transformer>
<transformer
implementation=
"org.apache.maven.plugins.shade.resource.ServicesResourceTransformer"
/>
</transformers>
<filters>
<filter>
<artifact>
*:*
</artifact>
<excludes>
<exclude>
META-INF/*.SF
</exclude>
<exclude>
META-INF/*.DSA
</exclude>
<exclude>
META-INF/*.RSA
</exclude>
</excludes>
</filter>
</filters>
</configuration>
</execution>
<!-- Main -->
<execution>
<id>
Main
</id>
<goals>
<goal>
shade
</goal>
</goals>
<configuration>
<outputFile>
target/assembly/${project.artifactId}-main.jar
</outputFile>
<shadedArtifactAttached>
true
</shadedArtifactAttached>
<transformers>
<transformer
implementation=
"org.apache.maven.plugins.shade.resource.ManifestResourceTransformer"
>
<mainClass>
detact.Main
</mainClass>
</transformer>
<transformer
implementation=
"org.apache.maven.plugins.shade.resource.ServicesResourceTransformer"
/>
</transformers>
<filters>
<filter>
...
...
src/main/java/detact/Aggregation.java
View file @
45684aa
...
...
@@ -44,7 +44,7 @@ public class Aggregation {
Utill
.
saveCSVDataSet
(
dataset
,
result_path
);
}
p
rivate
Dataset
<
Row
>
changeTimestempToLong
(
Dataset
<
Row
>
dataset
){
p
ublic
Dataset
<
Row
>
changeTimestempToLong
(
Dataset
<
Row
>
dataset
){
// cast timestamp to long
Dataset
<
Row
>
newDF
=
dataset
.
withColumn
(
"utc_click_time"
,
dataset
.
col
(
"click_time"
).
cast
(
"long"
));
newDF
=
newDF
.
withColumn
(
"utc_attributed_time"
,
dataset
.
col
(
"attributed_time"
).
cast
(
"long"
));
...
...
@@ -52,7 +52,7 @@ public class Aggregation {
return
newDF
;
}
p
rivate
Dataset
<
Row
>
averageValidClickCount
(
Dataset
<
Row
>
dataset
){
p
ublic
Dataset
<
Row
>
averageValidClickCount
(
Dataset
<
Row
>
dataset
){
// set Window partition by 'ip' and 'app' order by 'utc_click_time' select rows between 1st row to current row
WindowSpec
w
=
Window
.
partitionBy
(
"ip"
,
"app"
)
.
orderBy
(
"utc_click_time"
)
...
...
@@ -66,7 +66,7 @@ public class Aggregation {
return
newDF
;
}
p
rivate
Dataset
<
Row
>
clickTimeDelta
(
Dataset
<
Row
>
dataset
){
p
ublic
Dataset
<
Row
>
clickTimeDelta
(
Dataset
<
Row
>
dataset
){
WindowSpec
w
=
Window
.
partitionBy
(
"ip"
)
.
orderBy
(
"utc_click_time"
);
...
...
@@ -78,7 +78,7 @@ public class Aggregation {
return
newDF
;
}
p
rivate
Dataset
<
Row
>
countClickInTenMinutes
(
Dataset
<
Row
>
dataset
){
p
ublic
Dataset
<
Row
>
countClickInTenMinutes
(
Dataset
<
Row
>
dataset
){
WindowSpec
w
=
Window
.
partitionBy
(
"ip"
)
.
orderBy
(
"utc_click_time"
)
.
rangeBetween
(
Window
.
currentRow
(),
Window
.
currentRow
()+
600
);
...
...
src/main/java/detact/ML/DecisionTree.java
View file @
45684aa
...
...
@@ -107,6 +107,15 @@ public class DecisionTree {
(
DecisionTreeRegressionModel
)
(
model
.
stages
()[
1
]);
System
.
out
.
println
(
"Learned regression tree model:\n"
+
treeModel
.
toDebugString
());
// save model
model
.
save
(
"./decisionTree"
);
// load model
PipelineModel
load_mode
=
PipelineModel
.
load
(
"./decisionTree"
);
// Make predictions.
Dataset
<
Row
>
load_pred
=
model
.
transform
(
testData
);
}
}
...
...
src/main/java/detact/Main.java
0 → 100644
View file @
45684aa
package
detact
;
import
org.apache.spark.ml.Pipeline
;
import
org.apache.spark.ml.PipelineModel
;
import
org.apache.spark.ml.PipelineStage
;
import
org.apache.spark.ml.evaluation.RegressionEvaluator
;
import
org.apache.spark.ml.feature.VectorAssembler
;
import
org.apache.spark.ml.feature.VectorIndexer
;
import
org.apache.spark.ml.feature.VectorIndexerModel
;
import
org.apache.spark.ml.regression.DecisionTreeRegressionModel
;
import
org.apache.spark.ml.regression.DecisionTreeRegressor
;
import
org.apache.spark.sql.Dataset
;
import
org.apache.spark.sql.Row
;
import
org.apache.spark.sql.SparkSession
;
/**
 * End-to-end entry point: loads click data from a CSV file, runs the
 * {@link Aggregation} feature steps on it, trains a decision-tree regressor on
 * the {@code is_attributed} label, persists the fitted pipeline, reloads it and
 * reports the RMSE of the reloaded model on a held-out split.
 *
 * <p>Usage: {@code java -jar <artifactId>-main.jar <data_path>}
 */
public class Main {

    /** Directory the fitted pipeline is written to and then reloaded from. */
    private static final String MODEL_PATH = "./decisionTree";

    /**
     * Runs the aggregation + training + evaluation flow.
     *
     * @param args exactly one element: the path of the input data set handed to
     *             {@code Utill.loadCSVDataSet}
     * @throws Exception if loading the data, or saving/loading the model, fails
     */
    public static void main(String[] args) throws Exception {
        if (args.length != 1) {
            // Usage errors go to stderr with a non-zero status so that calling
            // scripts can detect the failed invocation.
            System.err.println("Usage: java -jar <artifactId>-main.jar <data_path>");
            System.exit(1);
        }

        String data_path = args[0];

        // Create Session
        SparkSession spark = SparkSession
                .builder()
                .appName("Detecting Fraud Clicks")
                .master("local")
                .getOrCreate();

        try {
            // detact.Aggregation: each step returns a new Dataset.
            // NOTE(review): the derived columns consumed by the assembler below
            // (avg_valid_click_count, click_time_delta, count_click_in_ten_mins)
            // are presumably added by these steps - confirm against Aggregation.
            Aggregation agg = new Aggregation();

            Dataset<Row> dataset = Utill.loadCSVDataSet(data_path, spark);
            dataset = agg.changeTimestempToLong(dataset);
            dataset = agg.averageValidClickCount(dataset);
            dataset = agg.clickTimeDelta(dataset);
            dataset = agg.countClickInTenMinutes(dataset);

            // Pack the raw and derived columns into a single "features" vector column.
            VectorAssembler assembler = new VectorAssembler()
                    .setInputCols(new String[]{
                            "ip",
                            "app",
                            "device",
                            "os",
                            "channel",
                            "utc_click_time",
                            "avg_valid_click_count",
                            "click_time_delta",
                            "count_click_in_ten_mins"
                    })
                    .setOutputCol("features");

            Dataset<Row> output = assembler.transform(dataset);

            // The indexer is fitted on the full assembled data set (before the
            // train/test split) and the already-fitted model is used as a stage.
            VectorIndexerModel featureIndexer = new VectorIndexer()
                    .setInputCol("features")
                    .setOutputCol("indexedFeatures")
                    .setMaxCategories(2)
                    .fit(output);

            // Split the result into training and test sets (30% held out for testing).
            Dataset<Row>[] splits = output.randomSplit(new double[]{0.7, 0.3});
            Dataset<Row> trainingData = splits[0];
            Dataset<Row> testData = splits[1];

            // Train a decision tree regression model.
            DecisionTreeRegressor dt = new DecisionTreeRegressor()
                    .setFeaturesCol("indexedFeatures")
                    .setLabelCol("is_attributed")
                    .setMaxDepth(10);

            // Chain indexer and tree in a Pipeline.
            Pipeline pipeline = new Pipeline()
                    .setStages(new PipelineStage[]{featureIndexer, dt});

            // Train model. This also runs the indexer.
            PipelineModel model = pipeline.fit(trainingData);

            // Save model. overwrite() lets the program be re-run: a plain
            // save() fails when the target directory already exists.
            model.write().overwrite().save(MODEL_PATH);

            // Reload the persisted model and use it for everything below, so
            // the save/load round trip is what actually gets evaluated.
            PipelineModel p_model = PipelineModel.load(MODEL_PATH);

            // Make predictions.
            Dataset<Row> predictions = p_model.transform(testData);

            // Select example rows to display.
            predictions.select("is_attributed", "features").show(5);

            // Select (prediction, true label) and compute test error.
            RegressionEvaluator evaluator = new RegressionEvaluator()
                    .setLabelCol("is_attributed")
                    .setPredictionCol("prediction")
                    .setMetricName("rmse");
            double rmse = evaluator.evaluate(predictions);
            System.out.println("Root Mean Squared Error (RMSE) on test result = " + rmse);

            // Stage 1 of the pipeline is the fitted tree (stage 0 is the indexer).
            DecisionTreeRegressionModel treeModel =
                    (DecisionTreeRegressionModel) (p_model.stages()[1]);
            System.out.println("Learned regression tree model:\n" + treeModel.toDebugString());
        } finally {
            // Release the Spark context even when a step above throws.
            spark.stop();
        }
    }
}
Please
register
or
login
to post a comment