In [1]:
from pyspark.sql import SparkSession
!pip install ipython-autotime
%load_ext autotime
from pyspark import SparkConf, SparkContext
spark = SparkSession.builder.appName('SVM-PSO').getOrCreate()
df = spark.read.csv('Dataset/breast_cancer.csv', header = True, inferSchema = True)
df.printSchema()
In [2]:
import pandas as pd

# Remove the 'id' column from the loaded frame.
df = df.drop('id')

# X: working frame without '_c32'
# (presumably an empty trailing column from the CSV -- TODO confirm).
X = df.drop('_c32')

# f: X without the 'diagnosis' column; y: the 'diagnosis' column reference.
f = X.drop('diagnosis')
y = df['diagnosis']

# Show the first five rows, transposed so each column becomes a row.
pd.DataFrame(X.take(5), columns=X.columns).T
Out[2]:
In [3]:
from pyspark.sql.functions import when, lit

# Encode the label as an integer: 'B' -> 0, every other value -> 1.
#
# The guard makes this cell safe to re-run. Once 'diagnosis' has been
# converted to a numeric column, comparing it with the string 'B' no longer
# matches any row (or fails outright under ANSI mode), so an unguarded re-run
# would overwrite every label with 1.
if dict(X.dtypes)['diagnosis'] == 'string':
    X = X.withColumn('diagnosis', when(X.diagnosis == 'B', lit(0)).otherwise(1))
In [4]:
# Row count per 'diagnosis' value, collected to pandas for display.
(
    X.groupBy('diagnosis')
     .count()
     .toPandas()
)
Out[4]:
In [5]:
from pyspark.ml.feature import VectorAssembler

# Pack every column of `f` (X without 'diagnosis') into one vector column
# named 'features', and append it to X.
vectorAssembler = VectorAssembler(
    inputCols=f.columns,
    outputCol='features',
)
X = vectorAssembler.transform(X)
In [6]:
# First five rows of X, transposed so each column becomes a row.
pd.DataFrame(X.take(5), columns=X.columns).T
Out[6]:
In [7]:
# Random split with weights 0.7 / 0.3 and a fixed seed.
train, test = X.randomSplit([0.7, 0.3], seed=42)

# Report the number of rows in each part.
print(f"Training Dataset Count: {train.count()}")
print(f"Test Dataset Count: {test.count()}")
Logistic Regression¶
In [8]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Fit logistic regression on the training split and predict on the test split.
lr = LogisticRegression(featuresCol='features', labelCol='diagnosis')
lrModel = lr.fit(train)
predictions = lrModel.transform(test)

# Score the predictions against the 'diagnosis' label using accuracy.
evaluator = MulticlassClassificationEvaluator(
    predictionCol="prediction",
    labelCol="diagnosis",
    metricName="accuracy",
)
print('Accuracy', evaluator.evaluate(predictions))
Decision Tree¶
In [9]:
from pyspark.ml.classification import DecisionTreeClassifier

# Fit a decision tree on the training split and predict on the test split.
dt = DecisionTreeClassifier(featuresCol="features", labelCol="diagnosis")
dtModel = dt.fit(train)
predictions = dtModel.transform(test)

# Score the predictions against the 'diagnosis' label using accuracy.
evaluator = MulticlassClassificationEvaluator(
    predictionCol="prediction",
    labelCol="diagnosis",
    metricName="accuracy",
)
print('Accuracy', evaluator.evaluate(predictions))
Naive Bayes¶
In [10]:
from pyspark.ml.classification import NaiveBayes

# Fit Naive Bayes (library defaults) on the training split and predict on
# the test split.
nb = NaiveBayes(featuresCol="features", labelCol="diagnosis")
nbModel = nb.fit(train)
predictions = nbModel.transform(test)

# Score the predictions against the 'diagnosis' label using accuracy.
evaluator = MulticlassClassificationEvaluator(
    predictionCol="prediction",
    labelCol="diagnosis",
    metricName="accuracy",
)
print('Accuracy', evaluator.evaluate(predictions))
SVM¶
In [11]:
from pyspark.ml.classification import LinearSVC

# Fit a linear SVM on the training split and predict on the test split.
svm = LinearSVC(featuresCol="features", labelCol="diagnosis")
svmModel = svm.fit(train)
predictions = svmModel.transform(test)

# Score the predictions against the 'diagnosis' label using accuracy.
evaluator = MulticlassClassificationEvaluator(
    predictionCol="prediction",
    labelCol="diagnosis",
    metricName="accuracy",
)
print('Accuracy', evaluator.evaluate(predictions))
No comments:
Post a Comment