Tuesday, July 28, 2020
Breast Cancer Prediction with Python and Apache Spark
In [1]:
!pip install ipython-autotime
%load_ext autotime

from pyspark.sql import SparkSession

# Start a local Spark session and load the breast cancer CSV
spark = SparkSession.builder.appName('SVM-PSO').getOrCreate()
df = spark.read.csv('Dataset/breast_cancer.csv', header=True, inferSchema=True)
df.printSchema()
Requirement already satisfied: ipython-autotime in c:\users\fajar\appdata\local\programs\python\python38\lib\site-packages (0.1)
root
 |-- id: integer (nullable = true)
 |-- diagnosis: string (nullable = true)
 |-- radius_mean: double (nullable = true)
 |-- texture_mean: double (nullable = true)
 |-- perimeter_mean: double (nullable = true)
 |-- area_mean: double (nullable = true)
 |-- smoothness_mean: double (nullable = true)
 |-- compactness_mean: double (nullable = true)
 |-- concavity_mean: double (nullable = true)
 |-- concave points_mean: double (nullable = true)
 |-- symmetry_mean: double (nullable = true)
 |-- fractal_dimension_mean: double (nullable = true)
 |-- radius_se: double (nullable = true)
 |-- texture_se: double (nullable = true)
 |-- perimeter_se: double (nullable = true)
 |-- area_se: double (nullable = true)
 |-- smoothness_se: double (nullable = true)
 |-- compactness_se: double (nullable = true)
 |-- concavity_se: double (nullable = true)
 |-- concave points_se: double (nullable = true)
 |-- symmetry_se: double (nullable = true)
 |-- fractal_dimension_se: double (nullable = true)
 |-- radius_worst: double (nullable = true)
 |-- texture_worst: double (nullable = true)
 |-- perimeter_worst: double (nullable = true)
 |-- area_worst: double (nullable = true)
 |-- smoothness_worst: double (nullable = true)
 |-- compactness_worst: double (nullable = true)
 |-- concavity_worst: double (nullable = true)
 |-- concave points_worst: double (nullable = true)
 |-- symmetry_worst: double (nullable = true)
 |-- fractal_dimension_worst: double (nullable = true)
 |-- _c32: string (nullable = true)
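The trailing `_c32` column is most likely an artifact of each CSV row ending with a comma, so it holds no data. A quick added check (not in the original notebook) confirms it is empty before it is dropped below:

df.select('_c32').distinct().show()   # expect only null/empty values for a blank column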
In [2]:
import pandas as pd

df = df.drop('id')        # the id column carries no predictive signal
X = df.drop('_c32')       # drop the empty trailing column
f = X.drop('diagnosis')   # f holds only the 30 feature columns; the label stays in X's diagnosis column
pd.DataFrame(X.take(5), columns=X.columns).transpose()
Out[2]:
| | 0 | 1 | 2 | 3 | 4 |
|---|---|---|---|---|---|
| diagnosis | M | M | M | M | M |
| radius_mean | 17.99 | 20.57 | 19.69 | 11.42 | 20.29 |
| texture_mean | 10.38 | 17.77 | 21.25 | 20.38 | 14.34 |
| perimeter_mean | 122.8 | 132.9 | 130 | 77.58 | 135.1 |
| area_mean | 1001 | 1326 | 1203 | 386.1 | 1297 |
| smoothness_mean | 0.1184 | 0.08474 | 0.1096 | 0.1425 | 0.1003 |
| compactness_mean | 0.2776 | 0.07864 | 0.1599 | 0.2839 | 0.1328 |
| concavity_mean | 0.3001 | 0.0869 | 0.1974 | 0.2414 | 0.198 |
| concave points_mean | 0.1471 | 0.07017 | 0.1279 | 0.1052 | 0.1043 |
| symmetry_mean | 0.2419 | 0.1812 | 0.2069 | 0.2597 | 0.1809 |
| fractal_dimension_mean | 0.07871 | 0.05667 | 0.05999 | 0.09744 | 0.05883 |
| radius_se | 1.095 | 0.5435 | 0.7456 | 0.4956 | 0.7572 |
| texture_se | 0.9053 | 0.7339 | 0.7869 | 1.156 | 0.7813 |
| perimeter_se | 8.589 | 3.398 | 4.585 | 3.445 | 5.438 |
| area_se | 153.4 | 74.08 | 94.03 | 27.23 | 94.44 |
| smoothness_se | 0.006399 | 0.005225 | 0.00615 | 0.00911 | 0.01149 |
| compactness_se | 0.04904 | 0.01308 | 0.04006 | 0.07458 | 0.02461 |
| concavity_se | 0.05373 | 0.0186 | 0.03832 | 0.05661 | 0.05688 |
| concave points_se | 0.01587 | 0.0134 | 0.02058 | 0.01867 | 0.01885 |
| symmetry_se | 0.03003 | 0.01389 | 0.0225 | 0.05963 | 0.01756 |
| fractal_dimension_se | 0.006193 | 0.003532 | 0.004571 | 0.009208 | 0.005115 |
| radius_worst | 25.38 | 24.99 | 23.57 | 14.91 | 22.54 |
| texture_worst | 17.33 | 23.41 | 25.53 | 26.5 | 16.67 |
| perimeter_worst | 184.6 | 158.8 | 152.5 | 98.87 | 152.2 |
| area_worst | 2019 | 1956 | 1709 | 567.7 | 1575 |
| smoothness_worst | 0.1622 | 0.1238 | 0.1444 | 0.2098 | 0.1374 |
| compactness_worst | 0.6656 | 0.1866 | 0.4245 | 0.8663 | 0.205 |
| concavity_worst | 0.7119 | 0.2416 | 0.4504 | 0.6869 | 0.4 |
| concave points_worst | 0.2654 | 0.186 | 0.243 | 0.2575 | 0.1625 |
| symmetry_worst | 0.4601 | 0.275 | 0.3613 | 0.6638 | 0.2364 |
| fractal_dimension_worst | 0.1189 | 0.08902 | 0.08758 | 0.173 | 0.07678 |
time: 4.94 s
In [3]:
from pyspark.sql.functions import when, lit

# Encode the label: benign (B) -> 0, malignant (M) -> 1
X = X.withColumn('diagnosis', when(X.diagnosis == 'B', lit(0)).otherwise(1))
time: 248 ms
In [4]:
X.groupby('diagnosis').count().toPandas()
Out[4]:
| | diagnosis | count |
|---|---|---|
| 0 | 1 | 212 |
| 1 | 0 | 357 |
time: 1.84 s
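The classes are imbalanced (357 benign vs. 212 malignant), so it helps to know the majority-class baseline before reading any of the accuracy numbers below. A minimal added sketch, not in the original notebook:

total = X.count()
majority = X.groupBy('diagnosis').count().orderBy('count', ascending=False).first()['count']
print('Baseline accuracy:', majority / total)   # 357/569, roughly 0.627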
In [5]:
from pyspark.ml.feature import VectorAssembler

# Collect the 30 numeric columns (f's schema) into a single 'features' vector column
vectorAssembler = VectorAssembler(inputCols=f.schema.names, outputCol='features')
X = vectorAssembler.transform(X)
time: 2.8 s
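The features span very different scales (area_mean in the thousands, fractal dimensions below 0.1), which matters for margin-based models such as the LinearSVC used later. A hedged sketch of an optional standardization step; the 'scaled_features' column name is an illustrative choice, not from the original notebook:

from pyspark.ml.feature import StandardScaler

# withMean=True assumes dense vectors, which the assembler yields here since every value is nonzero
scaler = StandardScaler(inputCol='features', outputCol='scaled_features',
                        withMean=True, withStd=True)
X_std = scaler.fit(X).transform(X)   # train on 'scaled_features' instead of 'features'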
In [6]:
pd.DataFrame(X.take(5), columns=X.columns).transpose()
Out[6]:
| | 0 | 1 | 2 | 3 | 4 |
|---|---|---|---|---|---|
| diagnosis | 1 | 1 | 1 | 1 | 1 |
| radius_mean | 17.99 | 20.57 | 19.69 | 11.42 | 20.29 |
| texture_mean | 10.38 | 17.77 | 21.25 | 20.38 | 14.34 |
| perimeter_mean | 122.8 | 132.9 | 130 | 77.58 | 135.1 |
| area_mean | 1001 | 1326 | 1203 | 386.1 | 1297 |
| smoothness_mean | 0.1184 | 0.08474 | 0.1096 | 0.1425 | 0.1003 |
| compactness_mean | 0.2776 | 0.07864 | 0.1599 | 0.2839 | 0.1328 |
| concavity_mean | 0.3001 | 0.0869 | 0.1974 | 0.2414 | 0.198 |
| concave points_mean | 0.1471 | 0.07017 | 0.1279 | 0.1052 | 0.1043 |
| symmetry_mean | 0.2419 | 0.1812 | 0.2069 | 0.2597 | 0.1809 |
| fractal_dimension_mean | 0.07871 | 0.05667 | 0.05999 | 0.09744 | 0.05883 |
| radius_se | 1.095 | 0.5435 | 0.7456 | 0.4956 | 0.7572 |
| texture_se | 0.9053 | 0.7339 | 0.7869 | 1.156 | 0.7813 |
| perimeter_se | 8.589 | 3.398 | 4.585 | 3.445 | 5.438 |
| area_se | 153.4 | 74.08 | 94.03 | 27.23 | 94.44 |
| smoothness_se | 0.006399 | 0.005225 | 0.00615 | 0.00911 | 0.01149 |
| compactness_se | 0.04904 | 0.01308 | 0.04006 | 0.07458 | 0.02461 |
| concavity_se | 0.05373 | 0.0186 | 0.03832 | 0.05661 | 0.05688 |
| concave points_se | 0.01587 | 0.0134 | 0.02058 | 0.01867 | 0.01885 |
| symmetry_se | 0.03003 | 0.01389 | 0.0225 | 0.05963 | 0.01756 |
| fractal_dimension_se | 0.006193 | 0.003532 | 0.004571 | 0.009208 | 0.005115 |
| radius_worst | 25.38 | 24.99 | 23.57 | 14.91 | 22.54 |
| texture_worst | 17.33 | 23.41 | 25.53 | 26.5 | 16.67 |
| perimeter_worst | 184.6 | 158.8 | 152.5 | 98.87 | 152.2 |
| area_worst | 2019 | 1956 | 1709 | 567.7 | 1575 |
| smoothness_worst | 0.1622 | 0.1238 | 0.1444 | 0.2098 | 0.1374 |
| compactness_worst | 0.6656 | 0.1866 | 0.4245 | 0.8663 | 0.205 |
| concavity_worst | 0.7119 | 0.2416 | 0.4504 | 0.6869 | 0.4 |
| concave points_worst | 0.2654 | 0.186 | 0.243 | 0.2575 | 0.1625 |
| symmetry_worst | 0.4601 | 0.275 | 0.3613 | 0.6638 | 0.2364 |
| fractal_dimension_worst | 0.1189 | 0.08902 | 0.08758 | 0.173 | 0.07678 |
| features | [17.99, 10.38, 122.8, 1001.0, 0.1184, 0.2776, ... | [20.57, 17.77, 132.9, 1326.0, 0.08474, 0.07864... | [19.69, 21.25, 130.0, 1203.0, 0.1096, 0.1599, ... | [11.42, 20.38, 77.58, 386.1, 0.1425, 0.2839, 0... | [20.29, 14.34, 135.1, 1297.0, 0.1003, 0.1328, ... |
time: 444 ms
In [7]:
train, test = X.randomSplit([0.7, 0.3], seed = 42)
print("Training Dataset Count: " + str(train.count()))
print("Test Dataset Count: " + str(test.count()))
Training Dataset Count: 426
Test Dataset Count: 143
time: 857 ms
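randomSplit is not stratified, so a quick sanity check (added here, not in the original) confirms that both splits keep roughly the same class ratio:

train.groupBy('diagnosis').count().show()
test.groupBy('diagnosis').count().show()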
Logistic Regression
In [8]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

lr = LogisticRegression(featuresCol='features', labelCol='diagnosis')
lrModel = lr.fit(train)
predictions = lrModel.transform(test)
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction", labelCol="diagnosis", metricName="accuracy")
print('Accuracy', evaluator.evaluate(predictions))
Accuracy 0.9300699300699301
time: 11.2 s
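Accuracy treats both error types equally; AUC (area under the ROC curve) is a common complement. A minimal added sketch, relying on the 'rawPrediction' column that LogisticRegression produces by default:

from pyspark.ml.evaluation import BinaryClassificationEvaluator

auc_evaluator = BinaryClassificationEvaluator(labelCol='diagnosis',
                                              rawPredictionCol='rawPrediction',
                                              metricName='areaUnderROC')
print('AUC', auc_evaluator.evaluate(predictions))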
Decision Tree
In [9]:
from pyspark.ml.classification import DecisionTreeClassifier
dt = DecisionTreeClassifier(labelCol="diagnosis", featuresCol="features")
dtModel = dt.fit(train)
predictions = dtModel.transform(test)
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction", labelCol="diagnosis",metricName="accuracy")
print('Accuracy', evaluator.evaluate(predictions))
Accuracy 0.965034965034965
time: 2.02 s
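Decision trees expose which measurements drive their splits. A small added sketch mapping dtModel.featureImportances back to column names (the vector order matches the VectorAssembler inputs, f.schema.names):

importances = sorted(zip(f.schema.names, dtModel.featureImportances.toArray()),
                     key=lambda t: t[1], reverse=True)
for name, score in importances[:5]:
    print(name, round(score, 4))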
Naive Bayes
In [10]:
from pyspark.ml.classification import NaiveBayes

# The default multinomial NaiveBayes requires nonnegative features,
# which holds for all 30 measurements in this dataset
nb = NaiveBayes(labelCol="diagnosis", featuresCol="features")
nbModel = nb.fit(train)
predictions = nbModel.transform(test)
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction", labelCol="diagnosis",metricName="accuracy")
print('Accuracy', evaluator.evaluate(predictions))
time: 888 ms
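Accuracy compresses everything into one number; a confusion matrix shows where the errors fall, and missed malignant cases are the costly kind here. A small added sketch, applicable to the Naive Bayes predictions above or any of the other models:

predictions.groupBy('diagnosis', 'prediction').count() \
    .orderBy('diagnosis', 'prediction').show()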
SVM
In [11]:
from pyspark.ml.classification import LinearSVC
svm = LinearSVC(labelCol="diagnosis", featuresCol="features")
svmModel = svm.fit(train)
predictions = svmModel.transform(test)
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction", labelCol="diagnosis",metricName="accuracy")
print('Accuracy', evaluator.evaluate(predictions))
Accuracy 0.958041958041958
time: 6.36 s
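The session name 'SVM-PSO' suggests the eventual goal is optimizing the SVM's parameters. Spark ships no particle swarm optimizer, but its built-in CrossValidator gives a grid-search baseline over regParam to compare any PSO result against. A hedged sketch, not part of the original notebook:

from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

# Reuse the accuracy evaluator from above to score each candidate regParam
grid = ParamGridBuilder().addGrid(svm.regParam, [0.01, 0.1, 1.0]).build()
cv = CrossValidator(estimator=svm, estimatorParamMaps=grid,
                    evaluator=evaluator, numFolds=5)
cvModel = cv.fit(train)
print('Accuracy', evaluator.evaluate(cvModel.transform(test)))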