Feature Importance

Extract important features using Gini impurity-based importance scores

1
## Based on: https://www.timlrx.com/2018/06/19/feature-selection-using-feature-importance-score-creating-a-pyspark-estimator/
import pandas as pd


def ExtractFeatureImportance(featureImp, dataset, featuresCol):
    """Map a fitted model's feature-importance vector back to feature names.

    Parameters
    ----------
    featureImp : indexable of float
        Importance scores indexed by vector position, e.g. the
        ``featureImportances`` attribute of a fitted tree-based model.
    dataset : DataFrame
        A dataset whose ``featuresCol`` carries ``ml_attr`` metadata
        (i.e. the output of a VectorAssembler).
    featuresCol : str
        Name of the assembled feature-vector column.

    Returns
    -------
    pandas.DataFrame
        One row per feature (``idx``, ``name``, ``score``), sorted by
        descending importance score.
    """
    # Flatten the attribute groups ("binary", "numeric", ...) into one list.
    # (The original built this with repeated list concatenation, which is
    # quadratic and re-read the schema metadata on every iteration.)
    attrs = dataset.schema[featuresCol].metadata["ml_attr"]["attrs"]
    list_extract = [attr for group in attrs.values() for attr in group]
    varlist = pd.DataFrame(list_extract)
    # 'idx' is the feature's position in the assembled vector.
    varlist['score'] = varlist['idx'].apply(lambda x: featureImp[x])
    return varlist.sort_values('score', ascending=False)
11
12
13
# For a plain (non-cross-validated) pipeline use:
#   ExtractFeatureImportance(model.stages[-1].featureImportances, dataset, "features")
# Score every feature with the best cross-validated model, then promote the
# resulting pandas frame to a Spark DataFrame for display.
importance_pdf = ExtractFeatureImportance(
    model.bestModel.featureImportances, dataset, "features"
)
dataset_fi = sqlContext.createDataFrame(importance_pdf)
display(dataset_fi)
Copied!

Extract important features using p-values

1
## Based on: https://stackoverflow.com/questions/42935914/how-to-map-features-from-the-output-of-a-vectorassembler-back-to-the-column-name
from itertools import chain

# The fitted linear model is the last stage of the pipeline.
lrm = model.stages[-1]
# Run the full pipeline so the features column carries ml_attr metadata.
transformed = model.transform(df)

# Flatten the attribute groups and order features by their vector position.
feature_meta = transformed.schema[lrm.summary.featuresCol].metadata["ml_attr"]["attrs"]
attrs = sorted(
    (attr["idx"], attr["name"])
    for attr in chain.from_iterable(feature_meta.values())
)

# Pair each feature name with its p-value (bare expression -> notebook output).
[(name, lrm.summary.pValues[idx]) for idx, name in attrs]
# Alternative: pair each feature name with its coefficient instead:
# [(name, lrm.coefficients[idx]) for idx, name in attrs]
Copied!

Extract coefficients from a model

1
## Map each feature in the assembled "features" vector to its fitted coefficient.
import pandas as pd

feature_attrs = dataset.schema["features"].metadata["ml_attr"]["attrs"]
# Tolerate a missing group: a model may have only numeric or only binary features.
featurelist = pd.DataFrame(
    feature_attrs.get("binary", []) + feature_attrs.get("numeric", [])
).sort_values("idx")
coef = model.bestModel.coefficients.toArray()
# BUG FIX: look each coefficient up by the feature's vector position ('idx').
# The previous `featurelist["Coefficient"] = pd.DataFrame(coef)` aligned on the
# pre-sort index labels — i.e. positions in the binary+numeric concatenation —
# which pairs coefficients with the wrong features whenever the two attribute
# groups interleave in the assembled vector.
featurelist["Coefficient"] = featurelist["idx"].apply(lambda i: coef[i])
featurelist = sqlContext.createDataFrame(featurelist)

display(featurelist)
Copied!