myspam.m 4.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124
  %% Initialization
  clear ; close all; clc

  %% ==================== Part 1: Email Preprocessing ====================
  %  To use an SVM to classify emails into Spam vs. Non-Spam, each email
  %  first has to be converted into a vector of features. This part runs
  %  the preprocessing step: processEmail.m produces a word indices vector
  %  for a given email.

  fprintf('\nPreprocessing sample email (emailSample1.txt)\n');

  % Extract Features
  file_contents = readFile('emailSample1.txt');
  word_indices  = processEmail(file_contents);

  % Print Stats
  fprintf('Word Indices: \n');
  fprintf(' %d ', word_indices);
  fprintf('\n\n');
  fprintf('num entries in word_indices: %d\n', length(word_indices));
  % The unique count used to be a bare, unsuppressed expression that echoed
  % an unlabeled "ans = N"; report it explicitly alongside the other stats.
  fprintf('num unique entries in word_indices: %d\n', length(unique(word_indices)));

  fprintf('Program paused. Press enter to continue.\n');
  pause;  % actually wait for the key press that the message announces
  %% ==================== Part 2: Feature Extraction ====================
  %  Convert the email into a vector of features in R^n. emailFeatures.m
  %  produces the feature vector from the word indices of a given email.

  fprintf('\nExtracting features from sample email (emailSample1.txt)\n');

  % Extract Features
  % The sample is read and preprocessed again here so this section does not
  % rely on the variables left behind by Part 1.
  file_contents = readFile('emailSample1.txt');
  word_indices  = processEmail(file_contents);
  features      = emailFeatures(word_indices);

  % Print Stats
  fprintf('Length of feature vector: %d\n', length(features));
  fprintf('Number of non-zero entries: %d\n', sum(features > 0));

  fprintf('Program paused. Press enter to continue.\n');
  pause;  % actually wait for the key press that the message announces
  %% =========== Part 3: Train Linear SVM for Spam Classification ========
  %  Fit a linear classifier that decides whether an email is Spam or
  %  Not-Spam, then report how well it reproduces its own training labels.

  % Load the Spam Email dataset.
  % spamTrain.mat is expected to put X and y into the workspace.
  load('spamTrain.mat');

  fprintf('\nTraining Linear SVM (Spam Classification)\n');
  fprintf('(this may take 1 to 2 minutes) ...\n');

  % Train with C = 0.1 and the linear kernel.
  C = 0.1;
  model = svmTrain(X, y, C, @linearKernel);

  % Predict on the training set and print the share of matching labels
  % as a percentage.
  p = svmPredict(model, X);
  fprintf('Training Accuracy: %f\n', 100 * mean(double(p == y)));
  %% =================== Part 4: Test Spam Classification ================
  %  After training the classifier, evaluate it on the held-out test set
  %  stored in spamTest.mat.

  % Load the test dataset.
  % spamTest.mat is expected to put Xtest and ytest into the workspace.
  load('spamTest.mat');

  fprintf('\nEvaluating the trained Linear SVM on a test set ...\n');

  % Predict on the test set and print the share of matching labels as a
  % percentage.
  p = svmPredict(model, Xtest);
  fprintf('Test Accuracy: %f\n', mean(double(p == ytest)) * 100);

  % Announce the pause so the script does not appear to hang silently;
  % the prompt text matches the one used by the other sections.
  fprintf('\nProgram paused. Press enter to continue.\n');
  pause;
  %% ================= Part 5: Top Predictors of Spam ====================
  %  Since the model being trained is a linear SVM, the learned weights can
  %  be inspected to understand how it decides whether an email is spam.
  %  The code below finds the words with the highest weights in the
  %  classifier. Informally, the classifier 'thinks' that these words are
  %  the most likely indicators of spam.

  % Sort the weights (largest first) and obtain the vocabulary list
  [weight, idx] = sort(model.w, 'descend');
  vocabList = getVocabList();

  fprintf('\nTop predictors of spam: \n');
  % Show at most 15 words; clamp so a model with fewer weights does not
  % index past the end of idx/weight.
  numTop = min(15, numel(idx));
  for k = 1:numTop
      % idx(k) is the position of the k-th largest weight in model.w and is
      % used to look up the matching entry of the vocabulary list.
      fprintf(' %-15s (%f) \n', vocabList{idx(k)}, weight(k));
  end

  fprintf('\n\n');
  fprintf('\nProgram paused. Press enter to continue.\n');
  pause;  % actually wait for the key press that the message announces
  %% =================== Part 6: Try Your Own Emails =====================
  %  Now that the spam classifier is trained, it can be used on your own
  %  emails. The starter code includes spamSample1.txt, spamSample2.txt,
  %  emailSample1.txt and emailSample2.txt as examples.
  %
  %  The loop below classifies spamSample1.txt through spamSample5.txt: the
  %  '#' in fname is replaced by the sample number. Change fname (for
  %  example to 'emailSample#.txt') and the loop range to see predictions
  %  on different kinds of email, or to try your own emails.
  %  NOTE(review): only spamSample1.txt and spamSample2.txt are listed as
  %  starter files above; samples 3-5 are presumably user-supplied --
  %  confirm they exist before running.

  fname = 'spamSample#.txt';
  for k = 1:5
      % Build the concrete file name; the semicolon keeps it from being
      % echoed, since it is printed by the fprintf below anyway.
      filename = strrep(fname, '#', num2str(k));

      % Read the email, preprocess it and turn it into a feature vector
      file_contents = readFile(filename);
      word_indices  = processEmail(file_contents);
      x             = emailFeatures(word_indices);

      % Classify with the model trained in Part 3
      p = svmPredict(model, x);

      fprintf('\nProcessed %s\n\nSpam Classification: %d\n', filename, p);
      fprintf('(1 indicates spam, 0 indicates not spam)\n\n');
  end  % plain 'end' (not Octave-only 'endfor') so the script also runs in MATLAB