Naïve Bayes
Setting Up a Naïve Bayes Classifier
Load in required libraries
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.mllib.evaluation import BinaryClassificationMetrics
Initialize Naïve Bayes object
# Naïve Bayes estimator that reads its inputs from the "features" column and
# its targets from the "label" column.
nb = NaiveBayes(featuresCol="features", labelCol="label")
Create a parameter grid for tuning the model
# Values of the estimator's `smoothing` parameter to try during tuning.
nb_smoothing_values = [0.0, 0.2, 0.4, 0.6, 0.8, 1.0]
nb_grid_builder = ParamGridBuilder()
nb_grid_builder.addGrid(nb.smoothing, nb_smoothing_values)
# One parameter map per smoothing value.
nbparamGrid = nb_grid_builder.build()
Define how you want the model to be evaluated
# Evaluator that scores a model from the "rawPrediction" column of its output.
nbevaluator = BinaryClassificationEvaluator(
    rawPredictionCol="rawPrediction",
)
Define the type of cross-validation you want to perform
# 5-fold cross-validator: fits `nb` once per entry in `nbparamGrid` and
# compares the candidates using `nbevaluator`.
nbcv = CrossValidator(
    numFolds=5,
    estimator=nb,
    evaluator=nbevaluator,
    estimatorParamMaps=nbparamGrid,
)
Fit the model to the data
# Run the cross-validated fit on the training data and show the fitted model.
nbcvModel = nbcv.fit(dataset=train)
print(nbcvModel)
Score the testing dataset using your fitted model for evaluation purposes
# Apply the fitted model to the test data; the result is evaluated afterwards.
nbpredictions = nbcvModel.transform(dataset=test)
Evaluate the model
# Evaluate the Naïve Bayes predictions with the Naïve Bayes evaluator
# (`nbpredictions` / `nbevaluator`), not objects belonging to another model.
# BinaryClassificationEvaluator reports area under the ROC curve unless a
# different metricName is configured, so the value is labelled accordingly
# rather than as accuracy.
print('Area under ROC (evaluator):', nbevaluator.evaluate(nbpredictions))
# BinaryClassificationMetrics expects an RDD of (score, label) pairs, in that
# order, so the prediction column must come first. Build the metrics object
# once and read both summary statistics from it.
nbmetrics = BinaryClassificationMetrics(nbpredictions['prediction', 'label'].rdd)
print('AUC:', nbmetrics.areaUnderROC)
print('PR:', nbmetrics.areaUnderPR)
Last updated
Was this helpful?