nb_test.py

from pyspark import SparkContext
from pyspark.mllib.classification import NaiveBayes, NaiveBayesModel
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.regression import LabeledPoint


def parseLine(line):
    # Each line is "<label>,<f1 f2 ... fn>": a numeric label, a comma,
    # then space-separated feature values.
    parts = line.split(',')
    label = float(parts[0])
    features = Vectors.dense([float(x) for x in parts[1].split(' ')])
    return LabeledPoint(label, features)
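
# For reference (an assumption based on Spark's bundled sample file, not part
# of the original script): sample_naive_bayes_data.txt contains lines such as
# "0,1 0 0", so parseLine('0,1 0 0') returns LabeledPoint(0.0, [1.0, 0.0, 0.0]).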


def train():
    sc = SparkContext(appName='nb_test')
    data = sc.textFile('../../data/mllib/sample_naive_bayes_data.txt').map(parseLine)

    # Split data approximately into training (60%) and test (40%).
    training, test = data.randomSplit([0.6, 0.4], seed=0)
    print(training.collect())

    # Train a naive Bayes model (the second argument is the smoothing parameter).
    model = NaiveBayes.train(training, 1.0)

    # Make predictions and compute test accuracy.
    predictionAndLabel = test.map(lambda p: (model.predict(p.features), p.label))
    accuracy = 1.0 * predictionAndLabel.filter(lambda pl: pl[0] == pl[1]).count() / test.count()
    print(accuracy)

    # Save the model; uncomment so predict() has a model to load.
    #model.save(sc, "../../target/myNaiveBayesModel")
    sc.stop()
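
# Aside (not in the original script): the fitted NaiveBayesModel also exposes
# its learned parameters as model.labels, model.pi (log class priors), and
# model.theta (log class-conditional feature probabilities), handy for a quick
# sanity check of what was learned.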


def predict():
    # Load a previously saved model and measure its accuracy.
    sc = SparkContext(appName='nb_test')
    sameModel = NaiveBayesModel.load(sc, "../../target/myNaiveBayesModel")
    data = sc.textFile('../../data/mllib/sample_naive_bayes_data.txt').map(parseLine)

    # Split data approximately into training (10%) and test (90%).
    training, test = data.randomSplit([0.1, 0.9], seed=0)
    print(test.collect())

    predictionAndLabel = test.map(lambda p: (sameModel.predict(p.features), p.label))
    print(predictionAndLabel.collect())
    accuracy = 1.0 * predictionAndLabel.filter(lambda pl: pl[0] == pl[1]).count() / test.count()
    print(accuracy)
    sc.stop()


if __name__ == '__main__':
    # Note: run train() first with the model.save(...) line uncommented,
    # otherwise predict() has no saved model to load.
    predict()
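
As written, the __main__ block only calls predict(), which fails unless an
earlier run of train() saved a model (the model.save line is commented out).
A minimal end-to-end sketch, assuming that line has been uncommented and that
the relative data and target paths resolve from the working directory:

    if __name__ == '__main__':
        train()    # fit, evaluate, and save the model
        predict()  # reload the saved model and evaluate again

Each function stops its SparkContext before returning, so the two calls can
share one process; otherwise the second SparkContext(...) would raise an error.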