Gnus development mailing list
 help / color / mirror / Atom feed
From: Andrew Cohen <cohen@andy.bu.edu>
Subject: spam-stat.el and mime
Date: Sat, 10 Jan 2004 11:43:27 -0500	[thread overview]
Message-ID: <87u133g3f4.fsf@andy.bu.edu> (raw)

[-- Attachment #1: Type: text/plain, Size: 624 bytes --]

I've been using spam-stat.el for ages, but was unhappy that it only
had a success rate of about 97%. Checking a bit this was almost
entirely because it did no decoding of mime (or base64) encoded
articles. I've modified it to decode mime (if you don't like this it
can be controlled by customizing the spam-treat-mime-function to
nil). 

After retraining, I now have a false-positive rate of less than .08%
(no false positives on my test directory of 1300 ham emails) and a
success rate of detecting spam of about 99.8%, which is as good or
better than any of the other Bayesian filters I've played with. 

Diffs attached.


[-- Attachment #2: spam-stat.el.diff --]
[-- Type: text/plain, Size: 6985 bytes --]

*** spam-stat.el.orig	Sat Jan 10 11:33:03 2004
--- spam-stat.el	Sat Jan 10 11:33:25 2004
***************
*** 171,176 ****
--- 171,182 ----
    :type 'number
    :group 'spam-stat)
  
+ (defcustom spam-treat-mime-function 'spam-treat-mime
+   "Function to treat MIME articles."
+   :group 'spam-stat
+   :type 'function)
+ 
+ 
  (defvar spam-stat-syntax-table
    (let ((table (copy-syntax-table text-mode-syntax-table)))
      (modify-syntax-entry ?- "w" table)
***************
*** 461,466 ****
--- 467,473 ----
        (progn
  	(set-buffer spam-stat-buffer)
  	(goto-char (point-min))
+ 	(spam-treat-article)
  	(when (> (spam-stat-score-buffer) spam-stat-split-fancy-spam-threshhold)
  	  (when (boundp 'nnmail-split-trace)
  	    (mapc (lambda (entry)
***************
*** 485,490 ****
--- 492,498 ----
  	  (setq count (1+ count))
  	  (message "Reading %s: %.2f%%" dir (/ count max))
  	  (insert-file-contents f)
+ 	  (spam-treat-article)
  	  (funcall func)
  	  (erase-buffer))))))
  
***************
*** 503,519 ****
    (interactive)
    (hash-table-count spam-stat))
  
! (defun spam-stat-test-directory (dir)
    "Test all the regular files in directory DIR for spam.
  If the result is 1.0, then all files are considered spam.
! If the result is 0.0, non of the files is considered spam.
  You can use this to determine error rates."
    (interactive "D")
    (let* ((files (directory-files dir t "^[^.]"))
  	 (total (length files))
  	 (score 0.0); float
  	 (max (/ total 100.0)); float
! 	 (count 0))
      (with-temp-buffer
        (dolist (f files)
  	(when (and (file-readable-p f)
--- 511,531 ----
    (interactive)
    (hash-table-count spam-stat))
  
! (defun spam-stat-test-directory (dir &optional sp)
    "Test all the regular files in directory DIR for spam.
  If the result is 1.0, then all files are considered spam.
! If the result is 0.0, none of the files is considered spam.
! If SP eq spam, print the list of spam files. Otherwise if SP is non-nil
! print the non-spam files.
  You can use this to determine error rates."
    (interactive "D")
    (let* ((files (directory-files dir t "^[^.]"))
  	 (total (length files))
  	 (score 0.0); float
  	 (max (/ total 100.0)); float
! 	 (count 0)
! 	 (spamf '())
! 	 (nspamf '()))
      (with-temp-buffer
        (dolist (f files)
  	(when (and (file-readable-p f)
***************
*** 523,531 ****
  	  (message "Reading %.2f%%, score %.2f%%"
  		   (/ count max) (/ score count))
  	  (insert-file-contents f)
  	  (when (> (spam-stat-score-buffer) 0.9)
! 	    (setq score (1+ score)))
  	  (erase-buffer))))
      (message "Final score: %d / %d = %f" score total (/ score total))))
  
  ;; Shrinking the dictionary
--- 535,548 ----
  	  (message "Reading %.2f%%, score %.2f%%"
  		   (/ count max) (/ score count))
  	  (insert-file-contents f)
+ 	  (push f nspamf)
+ 	  (spam-treat-article)
  	  (when (> (spam-stat-score-buffer) 0.9)
! 	    (setq score (1+ score))
! 	    (pop nspamf)
! 	    (push f spamf))
  	  (erase-buffer))))
+     (when sp (print (if (equal sp "spam") spamf nspamf)))
      (message "Final score: %d / %d = %f" score total (/ score total))))
  
  ;; Shrinking the dictionary
***************
*** 562,567 ****
--- 579,685 ----
    (remove-hook 'gnus-select-article-hook
  	       'spam-stat-store-gnus-article-buffer))
  
+ (defun spam-treat-article ()
+   "Treat the current buffer prior to spam analysis."
+   (interactive)
+   (spam-decode)
+   (let ((gnus-inhibit-treatment t)
+ 	(gnus-treatment-function-alist nil)
+ 	(gnus-article-image-alist nil)
+ 	(gnus-article-wash-types nil)
+ 	(gnus-article-buffer (current-buffer))
+ 	(buffer-read-only nil))
+     (when spam-treat-mime-function
+       (funcall spam-treat-mime-function))))
+ 
+ (defun spam-treat-mime (&optional ihandles)
+   "Treat MIME parts."
+   (save-excursion
+     (save-selected-window
+       (let ((window (get-buffer-window gnus-article-buffer))
+ 	    (point (point)))
+ 	(when window
+ 	  (select-window window)
+ 	  ;; We have to do this since selecting the window
+ 	  ;; may change the point.  So we set the window point.
+ 	  (set-window-point window point)))
+       (let* ((handles (or ihandles
+ 			  (mm-dissect-buffer nil gnus-article-loose-mime)
+ 			  (and gnus-article-emulate-mime
+ 			       (mm-uu-dissect))))
+ 	     buffer-read-only handle name type b e display)
+ 	(when (and (not ihandles)
+ 		   (not gnus-displaying-mime))
+ 	  ;; Top-level call; we clean up.
+ 	  (when gnus-article-mime-handles
+ 	    (mm-destroy-parts gnus-article-mime-handles)
+ 	    (setq gnus-article-mime-handle-alist nil));; A trick.
+ 	  (setq gnus-article-mime-handles handles)
+ 	  ;; We allow users to glean info from the handles.
+ 	  (when gnus-article-mime-part-function
+ 	    (gnus-mime-part-function handles)))
+ 	(if (and handles
+ 		 (or (not (stringp (car handles)))
+ 		     (cdr handles)))
+ 	    (progn
+ 	      (when (and (not ihandles)
+ 			 (not gnus-displaying-mime))
+ 		;; Clean up for mime parts.
+ 		(article-goto-body)
+ 		(delete-region (point) (point-max)))
+ 	      (let ((gnus-displaying-mime t))
+ 		(spam-treat-parts handles))))))))
+ 
+ (defun spam-treat-parts (handle)
+   (if (stringp (car handle))
+       (mapcar 'spam-treat-parts (cdr handle))
+     (if (bufferp (car handle))
+ 	(save-restriction
+ 	  (narrow-to-region (point) (point))
+ 	(when (let ((case-fold-search t))
+ 		(string-match "text"    (car (mm-handle-type handle))))
+ 	  (mm-insert-part handle))
+ 	  (goto-char (point-max)))
+       (mapcar 'spam-treat-parts handle))))
+ 
+ 
+ (defun spam-decode ()
+   "Translate a quoted-printable-encoded or base64 article."
+   (interactive)
+   (save-excursion
+     (let ((buffer-read-only nil) type charset)
+       (if (gnus-buffer-live-p (current-buffer))
+ 	  (with-current-buffer (current-buffer)
+ 	    (setq type
+ 		  (gnus-fetch-field "content-transfer-encoding"))
+ 	    (let* ((ct (gnus-fetch-field "content-type"))
+ 		   (ctl (and ct
+ 			     (ignore-errors
+ 			       (mail-header-parse-content-type ct)))))
+ 	      (setq charset (and ctl
+ 				 (mail-content-type-get ctl 'charset)))
+ 	      (if (stringp charset)
+ 		  (setq charset (intern (downcase charset)))))))
+      (unless charset
+ 	(setq charset gnus-newsgroup-charset))
+       (if (and type (let ((case-fold-search t))
+ 		      (string-match "quoted-printable" type)))
+ 	  (progn
+ 	    (article-goto-body)
+ 	    (quoted-printable-decode-region
+ 	     (point) (point-max) (mm-charset-to-coding-system charset)))
+ 	(progn
+ 	  (when (and type (let ((case-fold-search t))
+ 			    (string-match "base64" type)))
+ 	    (article-goto-body)
+ 	    (save-restriction
+ 	      (narrow-to-region (point) (point-max))
+ 	      (ignore-errors (base64-decode-region (point-min) (point-max))
+ 			     (mm-decode-coding-region
+ 			      (point-min) (point-max) 
+ 			      (mm-charset-to-coding-system charset))))))))))
+ 
+ 
  (provide 'spam-stat)
  
  ;;; spam-stat.el ends here

[-- Attachment #3: Type: text/plain, Size: 21 bytes --]



Regards,
Andy






             reply	other threads:[~2004-01-10 16:43 UTC|newest]

Thread overview: 11+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2004-01-10 16:43 Andrew Cohen [this message]
2004-01-11 20:37 ` Adam Sjøgren
2004-01-12  4:47 ` Jesper Harder
2004-01-12 21:37 ` Ted Zlatanov
2004-01-13 19:42 ` Adam Sjøgren
2004-01-20  5:56 ` Jesper Harder
2004-01-21  0:17   ` Ted Zlatanov
2004-01-21 20:41   ` Adam Sjøgren
2004-01-22  7:30     ` Jesper Harder
2004-01-22 13:49       ` Reiner Steib
2004-01-23  1:15       ` Jesper Harder

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=87u133g3f4.fsf@andy.bu.edu \
    --to=cohen@andy.bu.edu \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).