Tuesday, July 28, 2020

Prediksi Kanker Payudara dengan Python dan Apache Spark

SVM-PSO
In [1]:
from pyspark.sql import SparkSession
!pip install ipython-autotime
%load_ext autotime

from pyspark import SparkConf, SparkContext
spark = SparkSession.builder.appName('SVM-PSO').getOrCreate()

df = spark.read.csv('Dataset/breast_cancer.csv', header = True, inferSchema = True)
df.printSchema()
Requirement already satisfied: ipython-autotime in c:\users\fajar\appdata\local\programs\python\python38\lib\site-packages (0.1)
root
 |-- id: integer (nullable = true)
 |-- diagnosis: string (nullable = true)
 |-- radius_mean: double (nullable = true)
 |-- texture_mean: double (nullable = true)
 |-- perimeter_mean: double (nullable = true)
 |-- area_mean: double (nullable = true)
 |-- smoothness_mean: double (nullable = true)
 |-- compactness_mean: double (nullable = true)
 |-- concavity_mean: double (nullable = true)
 |-- concave points_mean: double (nullable = true)
 |-- symmetry_mean: double (nullable = true)
 |-- fractal_dimension_mean: double (nullable = true)
 |-- radius_se: double (nullable = true)
 |-- texture_se: double (nullable = true)
 |-- perimeter_se: double (nullable = true)
 |-- area_se: double (nullable = true)
 |-- smoothness_se: double (nullable = true)
 |-- compactness_se: double (nullable = true)
 |-- concavity_se: double (nullable = true)
 |-- concave points_se: double (nullable = true)
 |-- symmetry_se: double (nullable = true)
 |-- fractal_dimension_se: double (nullable = true)
 |-- radius_worst: double (nullable = true)
 |-- texture_worst: double (nullable = true)
 |-- perimeter_worst: double (nullable = true)
 |-- area_worst: double (nullable = true)
 |-- smoothness_worst: double (nullable = true)
 |-- compactness_worst: double (nullable = true)
 |-- concavity_worst: double (nullable = true)
 |-- concave points_worst: double (nullable = true)
 |-- symmetry_worst: double (nullable = true)
 |-- fractal_dimension_worst: double (nullable = true)
 |-- _c32: string (nullable = true)

In [2]:
import pandas as pd

# Drop non-feature columns: 'id' is a row identifier, and '_c32' is the
# empty trailing column Spark inferred from the CSV's trailing commas.
df = df.drop('id')
X = df.drop('_c32')

# f holds only the numeric feature columns (label excluded) — its schema
# names are used later as the VectorAssembler input list.
f = X.drop('diagnosis')

# NOTE(review): in PySpark this is a Column expression, not a materialized
# label vector as in scikit-learn; nothing downstream appears to use it.
y = df['diagnosis']

# Preview the first five rows transposed so all 31 columns fit on screen.
pd.DataFrame(X.take(5), columns=X.columns).transpose()
Out[2]:
0 1 2 3 4
diagnosis M M M M M
radius_mean 17.99 20.57 19.69 11.42 20.29
texture_mean 10.38 17.77 21.25 20.38 14.34
perimeter_mean 122.8 132.9 130 77.58 135.1
area_mean 1001 1326 1203 386.1 1297
smoothness_mean 0.1184 0.08474 0.1096 0.1425 0.1003
compactness_mean 0.2776 0.07864 0.1599 0.2839 0.1328
concavity_mean 0.3001 0.0869 0.1974 0.2414 0.198
concave points_mean 0.1471 0.07017 0.1279 0.1052 0.1043
symmetry_mean 0.2419 0.1812 0.2069 0.2597 0.1809
fractal_dimension_mean 0.07871 0.05667 0.05999 0.09744 0.05883
radius_se 1.095 0.5435 0.7456 0.4956 0.7572
texture_se 0.9053 0.7339 0.7869 1.156 0.7813
perimeter_se 8.589 3.398 4.585 3.445 5.438
area_se 153.4 74.08 94.03 27.23 94.44
smoothness_se 0.006399 0.005225 0.00615 0.00911 0.01149
compactness_se 0.04904 0.01308 0.04006 0.07458 0.02461
concavity_se 0.05373 0.0186 0.03832 0.05661 0.05688
concave points_se 0.01587 0.0134 0.02058 0.01867 0.01885
symmetry_se 0.03003 0.01389 0.0225 0.05963 0.01756
fractal_dimension_se 0.006193 0.003532 0.004571 0.009208 0.005115
radius_worst 25.38 24.99 23.57 14.91 22.54
texture_worst 17.33 23.41 25.53 26.5 16.67
perimeter_worst 184.6 158.8 152.5 98.87 152.2
area_worst 2019 1956 1709 567.7 1575
smoothness_worst 0.1622 0.1238 0.1444 0.2098 0.1374
compactness_worst 0.6656 0.1866 0.4245 0.8663 0.205
concavity_worst 0.7119 0.2416 0.4504 0.6869 0.4
concave points_worst 0.2654 0.186 0.243 0.2575 0.1625
symmetry_worst 0.4601 0.275 0.3613 0.6638 0.2364
fractal_dimension_worst 0.1189 0.08902 0.08758 0.173 0.07678
time: 4.94 s
In [3]:
from pyspark.sql.functions import when, lit

# Encode the diagnosis label numerically: benign ('B') -> 0, everything
# else (malignant 'M') -> 1.
X = X.withColumn(
    'diagnosis',
    when(X['diagnosis'] == 'B', lit(0)).otherwise(lit(1)),
)
time: 248 ms
In [4]:
X.groupby('diagnosis').count().toPandas()
Out[4]:
diagnosis count
0 1 212
1 0 357
time: 1.84 s
In [5]:
from pyspark.ml.feature import VectorAssembler

# Pack all numeric feature columns (names taken from `f`, which excludes the
# label) into a single 'features' vector column, as Spark ML estimators expect.
assembler = VectorAssembler(inputCols=f.schema.names, outputCol='features')
X = assembler.transform(X)
time: 2.8 s
In [6]:
pd.DataFrame(X.take(5), columns=X.columns).transpose()
Out[6]:
0 1 2 3 4
diagnosis 1 1 1 1 1
radius_mean 17.99 20.57 19.69 11.42 20.29
texture_mean 10.38 17.77 21.25 20.38 14.34
perimeter_mean 122.8 132.9 130 77.58 135.1
area_mean 1001 1326 1203 386.1 1297
smoothness_mean 0.1184 0.08474 0.1096 0.1425 0.1003
compactness_mean 0.2776 0.07864 0.1599 0.2839 0.1328
concavity_mean 0.3001 0.0869 0.1974 0.2414 0.198
concave points_mean 0.1471 0.07017 0.1279 0.1052 0.1043
symmetry_mean 0.2419 0.1812 0.2069 0.2597 0.1809
fractal_dimension_mean 0.07871 0.05667 0.05999 0.09744 0.05883
radius_se 1.095 0.5435 0.7456 0.4956 0.7572
texture_se 0.9053 0.7339 0.7869 1.156 0.7813
perimeter_se 8.589 3.398 4.585 3.445 5.438
area_se 153.4 74.08 94.03 27.23 94.44
smoothness_se 0.006399 0.005225 0.00615 0.00911 0.01149
compactness_se 0.04904 0.01308 0.04006 0.07458 0.02461
concavity_se 0.05373 0.0186 0.03832 0.05661 0.05688
concave points_se 0.01587 0.0134 0.02058 0.01867 0.01885
symmetry_se 0.03003 0.01389 0.0225 0.05963 0.01756
fractal_dimension_se 0.006193 0.003532 0.004571 0.009208 0.005115
radius_worst 25.38 24.99 23.57 14.91 22.54
texture_worst 17.33 23.41 25.53 26.5 16.67
perimeter_worst 184.6 158.8 152.5 98.87 152.2
area_worst 2019 1956 1709 567.7 1575
smoothness_worst 0.1622 0.1238 0.1444 0.2098 0.1374
compactness_worst 0.6656 0.1866 0.4245 0.8663 0.205
concavity_worst 0.7119 0.2416 0.4504 0.6869 0.4
concave points_worst 0.2654 0.186 0.243 0.2575 0.1625
symmetry_worst 0.4601 0.275 0.3613 0.6638 0.2364
fractal_dimension_worst 0.1189 0.08902 0.08758 0.173 0.07678
features [17.99, 10.38, 122.8, 1001.0, 0.1184, 0.2776, ... [20.57, 17.77, 132.9, 1326.0, 0.08474, 0.07864... [19.69, 21.25, 130.0, 1203.0, 0.1096, 0.1599, ... [11.42, 20.38, 77.58, 386.1, 0.1425, 0.2839, 0... [20.29, 14.34, 135.1, 1297.0, 0.1003, 0.1328, ...
time: 444 ms
In [7]:
# 70/30 split; the fixed seed makes the split reproducible across re-runs.
train, test = X.randomSplit([0.7, 0.3], seed=42)
print(f"Training Dataset Count: {train.count()}")
print(f"Test Dataset Count: {test.count()}")
Training Dataset Count: 426
Test Dataset Count: 143
time: 857 ms

Logistic Regression

In [8]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Baseline model: logistic regression on the assembled feature vector.
lr = LogisticRegression(featuresCol='features', labelCol='diagnosis')
lrModel = lr.fit(train)
predictions = lrModel.transform(test)

# Accuracy = fraction of test rows where the predicted class equals the label.
evaluator = MulticlassClassificationEvaluator(
    predictionCol="prediction", labelCol="diagnosis", metricName="accuracy")
print('Accuracy', evaluator.evaluate(predictions))
Accuracy 0.9300699300699301
time: 11.2 s

Decision Tree

In [9]:
from pyspark.ml.classification import DecisionTreeClassifier

# Single decision tree trained on the same train/test split for comparison.
dt = DecisionTreeClassifier(labelCol="diagnosis", featuresCol="features")
dtModel = dt.fit(train)
predictions = dtModel.transform(test)

# Same evaluator setup as the other models: plain accuracy on the test set.
evaluator = MulticlassClassificationEvaluator(
    labelCol="diagnosis", predictionCol="prediction", metricName="accuracy")
print('Accuracy', evaluator.evaluate(predictions))
Accuracy 0.965034965034965
time: 2.02 s

Naive Bayes

In [10]:
from pyspark.ml.classification import NaiveBayes

# Naive Bayes classifier on the same split; all features here are
# non-negative, which the default multinomial model requires.
nb = NaiveBayes(labelCol="diagnosis", featuresCol="features")
nbModel = nb.fit(train)
predictions = nbModel.transform(test)

# Evaluate with the same accuracy metric used for the other models.
evaluator = MulticlassClassificationEvaluator(
    labelCol="diagnosis", predictionCol="prediction", metricName="accuracy")
print('Accuracy', evaluator.evaluate(predictions))
time: 888 ms

SVM

In [11]:
from pyspark.ml.classification import LinearSVC

# Linear support-vector classifier (binary) on the same train/test split.
svm = LinearSVC(labelCol="diagnosis", featuresCol="features")
svmModel = svm.fit(train)
predictions = svmModel.transform(test)

# Report test-set accuracy, consistent with the other models above.
evaluator = MulticlassClassificationEvaluator(
    labelCol="diagnosis", predictionCol="prediction", metricName="accuracy")
print('Accuracy', evaluator.evaluate(predictions))
Accuracy 0.958041958041958
time: 6.36 s

No comments:

Post a Comment

Beneran

Testsfsasfss