Gnus development mailing list
 help / color / mirror / Atom feed
* spam-stat.el and mime
@ 2004-01-10 16:43 Andrew Cohen
  2004-01-11 20:37 ` Adam Sjøgren
                   ` (4 more replies)
  0 siblings, 5 replies; 11+ messages in thread
From: Andrew Cohen @ 2004-01-10 16:43 UTC (permalink / raw)


[-- Attachment #1: Type: text/plain, Size: 624 bytes --]

I've been using spam-stat.el for ages, but was unhappy that it only
had a success rate of about 97%. Checking a bit this was almost
entirely because it did no decoding of mime (or base64) encoded
articles. I've modified it to decode mime (if you don't like this it
can be controlled by customizing the spam-treat-mime-function to
nil). 

After retraining, I now have a false-positive rate of less than .08%
(no false positives on my test directory of 1300 ham emails) and a
success rate of detecting spam of about 99.8%, which is as good or
better than any of the other Bayesian filters I've played with. 

Diffs attached.


[-- Attachment #2: spam-stat.el.diff --]
[-- Type: text/plain, Size: 6985 bytes --]

*** spam-stat.el.orig	Sat Jan 10 11:33:03 2004
--- spam-stat.el	Sat Jan 10 11:33:25 2004
***************
*** 171,176 ****
--- 171,182 ----
    :type 'number
    :group 'spam-stat)
  
+ (defcustom spam-treat-mime-function 'spam-treat-mime
+   "Function to treat MIME articles."
+   :group 'spam-stat
+   :type 'function)
+ 
+ 
  (defvar spam-stat-syntax-table
    (let ((table (copy-syntax-table text-mode-syntax-table)))
      (modify-syntax-entry ?- "w" table)
***************
*** 461,466 ****
--- 467,473 ----
        (progn
  	(set-buffer spam-stat-buffer)
  	(goto-char (point-min))
+ 	(spam-treat-article)
  	(when (> (spam-stat-score-buffer) spam-stat-split-fancy-spam-threshhold)
  	  (when (boundp 'nnmail-split-trace)
  	    (mapc (lambda (entry)
***************
*** 485,490 ****
--- 492,498 ----
  	  (setq count (1+ count))
  	  (message "Reading %s: %.2f%%" dir (/ count max))
  	  (insert-file-contents f)
+ 	  (spam-treat-article)
  	  (funcall func)
  	  (erase-buffer))))))
  
***************
*** 503,519 ****
    (interactive)
    (hash-table-count spam-stat))
  
! (defun spam-stat-test-directory (dir)
    "Test all the regular files in directory DIR for spam.
  If the result is 1.0, then all files are considered spam.
! If the result is 0.0, non of the files is considered spam.
  You can use this to determine error rates."
    (interactive "D")
    (let* ((files (directory-files dir t "^[^.]"))
  	 (total (length files))
  	 (score 0.0); float
  	 (max (/ total 100.0)); float
! 	 (count 0))
      (with-temp-buffer
        (dolist (f files)
  	(when (and (file-readable-p f)
--- 511,531 ----
    (interactive)
    (hash-table-count spam-stat))
  
! (defun spam-stat-test-directory (dir &optional sp)
    "Test all the regular files in directory DIR for spam.
  If the result is 1.0, then all files are considered spam.
! If the result is 0.0, none of the files is considered spam.
! If SP eq spam, print the list of spam files. Otherwise if SP is non-nil
! print the non-spam files.
  You can use this to determine error rates."
    (interactive "D")
    (let* ((files (directory-files dir t "^[^.]"))
  	 (total (length files))
  	 (score 0.0); float
  	 (max (/ total 100.0)); float
! 	 (count 0)
! 	 (spamf '())
! 	 (nspamf '()))
      (with-temp-buffer
        (dolist (f files)
  	(when (and (file-readable-p f)
***************
*** 523,531 ****
  	  (message "Reading %.2f%%, score %.2f%%"
  		   (/ count max) (/ score count))
  	  (insert-file-contents f)
  	  (when (> (spam-stat-score-buffer) 0.9)
! 	    (setq score (1+ score)))
  	  (erase-buffer))))
      (message "Final score: %d / %d = %f" score total (/ score total))))
  
  ;; Shrinking the dictionary
--- 535,548 ----
  	  (message "Reading %.2f%%, score %.2f%%"
  		   (/ count max) (/ score count))
  	  (insert-file-contents f)
+ 	  (push f nspamf)
+ 	  (spam-treat-article)
  	  (when (> (spam-stat-score-buffer) 0.9)
! 	    (setq score (1+ score))
! 	    (pop nspamf)
! 	    (push f spamf))
  	  (erase-buffer))))
+     (when sp (print (if (equal sp "spam") spamf nspamf)))
      (message "Final score: %d / %d = %f" score total (/ score total))))
  
  ;; Shrinking the dictionary
***************
*** 562,567 ****
--- 579,685 ----
    (remove-hook 'gnus-select-article-hook
  	       'spam-stat-store-gnus-article-buffer))
  
+ (defun spam-treat-article ()
+   "Treat the current buffer prior to spam analysis."
+   (interactive)
+   (spam-decode)
+   (let ((gnus-inhibit-treatment t)
+ 	(gnus-treatment-function-alist nil)
+ 	(gnus-article-image-alist nil)
+ 	(gnus-article-wash-types nil)
+ 	(gnus-article-buffer (current-buffer))
+ 	(buffer-read-only nil))
+     (when spam-treat-mime-function
+       (funcall spam-treat-mime-function))))
+ 
+ (defun spam-treat-mime (&optional ihandles)
+   "Treat MIME parts."
+   (save-excursion
+     (save-selected-window
+       (let ((window (get-buffer-window gnus-article-buffer))
+ 	    (point (point)))
+ 	(when window
+ 	  (select-window window)
+ 	  ;; We have to do this since selecting the window
+ 	  ;; may change the point.  So we set the window point.
+ 	  (set-window-point window point)))
+       (let* ((handles (or ihandles
+ 			  (mm-dissect-buffer nil gnus-article-loose-mime)
+ 			  (and gnus-article-emulate-mime
+ 			       (mm-uu-dissect))))
+ 	     buffer-read-only handle name type b e display)
+ 	(when (and (not ihandles)
+ 		   (not gnus-displaying-mime))
+ 	  ;; Top-level call; we clean up.
+ 	  (when gnus-article-mime-handles
+ 	    (mm-destroy-parts gnus-article-mime-handles)
+ 	    (setq gnus-article-mime-handle-alist nil));; A trick.
+ 	  (setq gnus-article-mime-handles handles)
+ 	  ;; We allow users to glean info from the handles.
+ 	  (when gnus-article-mime-part-function
+ 	    (gnus-mime-part-function handles)))
+ 	(if (and handles
+ 		 (or (not (stringp (car handles)))
+ 		     (cdr handles)))
+ 	    (progn
+ 	      (when (and (not ihandles)
+ 			 (not gnus-displaying-mime))
+ 		;; Clean up for mime parts.
+ 		(article-goto-body)
+ 		(delete-region (point) (point-max)))
+ 	      (let ((gnus-displaying-mime t))
+ 		(spam-treat-parts handles))))))))
+ 
+ (defun spam-treat-parts (handle)
+   (if (stringp (car handle))
+       (mapcar 'spam-treat-parts (cdr handle))
+     (if (bufferp (car handle))
+ 	(save-restriction
+ 	  (narrow-to-region (point) (point))
+ 	(when (let ((case-fold-search t))
+ 		(string-match "text"    (car (mm-handle-type handle))))
+ 	  (mm-insert-part handle))
+ 	  (goto-char (point-max)))
+       (mapcar 'spam-treat-parts handle))))
+ 
+ 
+ (defun spam-decode ()
+   "Translate a quoted-printable-encoded or base64 article."
+   (interactive)
+   (save-excursion
+     (let ((buffer-read-only nil) type charset)
+       (if (gnus-buffer-live-p (current-buffer))
+ 	  (with-current-buffer (current-buffer)
+ 	    (setq type
+ 		  (gnus-fetch-field "content-transfer-encoding"))
+ 	    (let* ((ct (gnus-fetch-field "content-type"))
+ 		   (ctl (and ct
+ 			     (ignore-errors
+ 			       (mail-header-parse-content-type ct)))))
+ 	      (setq charset (and ctl
+ 				 (mail-content-type-get ctl 'charset)))
+ 	      (if (stringp charset)
+ 		  (setq charset (intern (downcase charset)))))))
+      (unless charset
+ 	(setq charset gnus-newsgroup-charset))
+       (if (and type (let ((case-fold-search t))
+ 		      (string-match "quoted-printable" type)))
+ 	  (progn
+ 	    (article-goto-body)
+ 	    (quoted-printable-decode-region
+ 	     (point) (point-max) (mm-charset-to-coding-system charset)))
+ 	(progn
+ 	  (when (and type (let ((case-fold-search t))
+ 			    (string-match "base64" type)))
+ 	    (article-goto-body)
+ 	    (save-restriction
+ 	      (narrow-to-region (point) (point-max))
+ 	      (ignore-errors (base64-decode-region (point-min) (point-max))
+ 			     (mm-decode-coding-region
+ 			      (point-min) (point-max) 
+ 			      (mm-charset-to-coding-system charset))))))))))
+ 
+ 
  (provide 'spam-stat)
  
  ;;; spam-stat.el ends here

[-- Attachment #3: Type: text/plain, Size: 21 bytes --]



Regards,
Andy






^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: spam-stat.el and mime
  2004-01-10 16:43 spam-stat.el and mime Andrew Cohen
@ 2004-01-11 20:37 ` Adam Sjøgren
  2004-01-12  4:47 ` Jesper Harder
                   ` (3 subsequent siblings)
  4 siblings, 0 replies; 11+ messages in thread
From: Adam Sjøgren @ 2004-01-11 20:37 UTC (permalink / raw)


On Sat, 10 Jan 2004 11:43:27 -0500, Andrew wrote:

> I've been using spam-stat.el for ages, but was unhappy that it only
> had a success rate of about 97%. Checking a bit this was almost
> entirely because it did no decoding of mime (or base64) encoded
> articles. I've modified it to decode mime (if you don't like this it
> can be controlled by customizing the spam-treat-mime-function to
> nil).

Very cool!

I've applied your patch to my spam-stat.el (I used an ugly hack
calling an external perl-script to do the un-base64'ing previously)
and it looks like it works nicely here.

When splitting mail I've begun to get messages like this in
*Message-Log*:

 Error while decoding: (error Premature EOF while decoding base64)

and:

 Malformed quoted-printable text

I'm wondering whether that's only natural or if it could indicate some
problem?

(I can try and track down some of the 'offending' spam if that can
help).


  Best regards,

-- 
 "Q: Are you happy?                                           Adam Sjøgren
  A: Yes. As an ashtray maybe."                          asjo@koldfront.dk




^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: spam-stat.el and mime
  2004-01-10 16:43 spam-stat.el and mime Andrew Cohen
  2004-01-11 20:37 ` Adam Sjøgren
@ 2004-01-12  4:47 ` Jesper Harder
  2004-01-12 21:37 ` Ted Zlatanov
                   ` (2 subsequent siblings)
  4 siblings, 0 replies; 11+ messages in thread
From: Jesper Harder @ 2004-01-12  4:47 UTC (permalink / raw)


Andrew Cohen <cohen@andy.bu.edu> writes:

> I've modified it to decode mime (if you don't like this it can be
> controlled by customizing the spam-treat-mime-function to nil).

An often requested feature!  Below are a few pedantic comments on the
code:

> + 	  ;; We allow users to glean info from the handles.
> + 	  (when gnus-article-mime-part-function
> + 	    (gnus-mime-part-function handles)))

If someone is using this feature, they probably don't want to run
`gnus-article-mime-part-function' when processing spam.

> +       (if (gnus-buffer-live-p (current-buffer))

(current-buffer) always returns a live buffer, so you don't need this
line.

> + 	  (with-current-buffer (current-buffer)

This line should also be redundant.

> +       (if (and type (let ((case-fold-search t))
> + 		      (string-match "quoted-printable" type)))
> + 	  (progn
> + 	    (article-goto-body)
> + 	    (quoted-printable-decode-region
> + 	     (point) (point-max) (mm-charset-to-coding-system charset)))
> + 	(progn
> + 	  (when (and type (let ((case-fold-search t))

You don't need a progn around the ELSE part of an `if' in Emacs Lisp
(in contrast with Common Lisp).




^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: spam-stat.el and mime
  2004-01-10 16:43 spam-stat.el and mime Andrew Cohen
  2004-01-11 20:37 ` Adam Sjøgren
  2004-01-12  4:47 ` Jesper Harder
@ 2004-01-12 21:37 ` Ted Zlatanov
  2004-01-13 19:42 ` Adam Sjøgren
  2004-01-20  5:56 ` Jesper Harder
  4 siblings, 0 replies; 11+ messages in thread
From: Ted Zlatanov @ 2004-01-12 21:37 UTC (permalink / raw)
  Cc: ding

On Sat, 10 Jan 2004, cohen@andy.bu.edu wrote:

> I've been using spam-stat.el for ages, but was unhappy that it only
> had a success rate of about 97%. Checking a bit this was almost
> entirely because it did no decoding of mime (or base64) encoded
> articles. I've modified it to decode mime (if you don't like this it
> can be controlled by customizing the spam-treat-mime-function to
> nil). 
> 
> After retraining, I now have a false-positive rate of less than .08%
> (no false positives on my test directory of 1300 ham emails) and a
> success rate of detecting spam of about 99.8%, which is as good or
> better than any of the other Bayesian filters I've played with. 

That's very cool, but shouldn't it go into spam.el so other backends
besides spam-stat can use it?

If you can incorporate Jesper's suggestions into your code, you're
welcome to make a patch to put the code in spam.el or let me do
it.  This is definitely a good feature for the Gnus users.

Thanks
Ted



^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: spam-stat.el and mime
  2004-01-10 16:43 spam-stat.el and mime Andrew Cohen
                   ` (2 preceding siblings ...)
  2004-01-12 21:37 ` Ted Zlatanov
@ 2004-01-13 19:42 ` Adam Sjøgren
  2004-01-20  5:56 ` Jesper Harder
  4 siblings, 0 replies; 11+ messages in thread
From: Adam Sjøgren @ 2004-01-13 19:42 UTC (permalink / raw)


[-- Attachment #1: Type: text/plain, Size: 1076 bytes --]

I just re-trained spam-stat on my box, and couldn't (spam-stat-load)
afterwards.

The error I got when starting, as ~/.spam-stat.el was being read was:
Invalid read syntax: "."

After trying to edit the file (12457514 bytes on one line is not
XEmacs's cup-o-tea), I gave up and replaced all occurrences of )( with
)\n( and grepped for lines with three "'s in them (hunch). Yielding:

 13587:("A"1YGO0m" 0 4)
 168855:("5?=CA"<S" 0 1)
 183068:(";}0"GU4O4Y." 0 1)
 212714:("A"8qGR<v" 0 4)
 317258:("0"A>" 0 9)
 569537:("0m9_A"<vC3" 0 1)
 667177:("AwA"" 0 6)

Aha! These lines were a little tricky to find in the file without the
line numbers, because there were escape chars in them that the xterm
ate.

Copying the first line from the buffer to another got me this:

 ("접근하고" 0 4)

Suggesting, perhaps, that it's an encoding issue of some sort?

I'm using No Gnus v0.1 on XEmacs 21.4 (patch 14) "Reasonable
Discussion" [Lucid] (i386-debian-linux, Mule) of Sat Sep 27 2003 on
eeyore, with the spam-stat.el patch of this thread.


  Best regards,

[-- Attachment #2: Type: text/plain, Size: 159 bytes --]


-- 
 "Relax. Stupidity produces antibodies."                      Adam Sjøgren
 "Air filter! Air filter!"                               asjo@koldfront.dk

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: spam-stat.el and mime
  2004-01-10 16:43 spam-stat.el and mime Andrew Cohen
                   ` (3 preceding siblings ...)
  2004-01-13 19:42 ` Adam Sjøgren
@ 2004-01-20  5:56 ` Jesper Harder
  2004-01-21  0:17   ` Ted Zlatanov
  2004-01-21 20:41   ` Adam Sjøgren
  4 siblings, 2 replies; 11+ messages in thread
From: Jesper Harder @ 2004-01-20  5:56 UTC (permalink / raw)


[-- Attachment #1: Type: text/plain, Size: 873 bytes --]

Andrew Cohen <cohen@andy.bu.edu> writes:

> Checking a bit this was almost entirely because it did no decoding
> of mime (or base64) encoded articles. I've modified it to decode
> mime (if you don't like this it can be controlled by customizing the
> spam-treat-mime-function to nil).

I looked a bit more at it:

+ (defun spam-treat-article ()
+   "Treat the current buffer prior to spam analysis."
+   (interactive)
+   (spam-decode)
    ^^^^^^^^^^^^^

It doesn't work to call `spam-decode' here -- you have to decode each
MIME part separately.

It's not so easy to use the existing MIME parsing functions in Gnus
for this purpose.  They were written with display in mind, and they're
also very slow because they do a lot of fancy stuff, which is
unnecessary in this context.

Please try the attached code (it increased my spam recognition rate by
4 percentage points).


[-- Attachment #2: spamwash.el --]
[-- Type: application/emacs-lisp, Size: 2340 bytes --]

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #3: Type: text/x-patch, Size: 4714 bytes --]

*** /home/harder/gnus/lisp/spam-stat.el	Mon Jan  5 20:12:20 2004
--- /home/harder/cvsgnus/lisp/spam-stat.el	Tue Jan 20 06:54:45 2004
***************
*** 122,127 ****
--- 122,128 ----
  \f
  
  ;;; Code:
+ (require 'spamwash)
  
  (defgroup spam-stat nil
    "Statistical spam detection for Emacs.
***************
*** 171,176 ****
--- 172,182 ----
    :type 'number
    :group 'spam-stat)
  
+ (defcustom spam-stat-washing-hook '(spamwash)
+   "Hook applied to each message before analysis."
+   :type 'hook
+   :group 'spam-stat)
+ 
  (defvar spam-stat-syntax-table
    (let ((table (copy-syntax-table text-mode-syntax-table)))
      (modify-syntax-entry ?- "w" table)
***************
*** 291,296 ****
--- 297,303 ----
  
  (defun spam-stat-buffer-words ()
    "Return a hash table of words and number of occurences in the buffer."
+   (run-hooks 'spam-stat-washing-hook)
    (with-spam-stat-max-buffer-size
     (with-syntax-table spam-stat-syntax-table
       (goto-char (point-min))
***************
*** 369,395 ****
    "Save the `spam-stat' hash table as lisp file."
    (interactive)
    (when (or force spam-stat-dirty)
!     (with-temp-buffer
!       (let ((standard-output (current-buffer))
! 	    (font-lock-maximum-size 0))
! 	(insert "(setq spam-stat-ngood "
! 		(number-to-string spam-stat-ngood)
! 		" spam-stat-nbad "
! 		(number-to-string spam-stat-nbad)
! 		" spam-stat (spam-stat-to-hash-table '(")
! 	(maphash (lambda (word entry)
! 		   (prin1 (list word
! 				(spam-stat-good entry)
! 				(spam-stat-bad entry))))
! 		 spam-stat)
! 	(insert ")))")
! 	(write-file spam-stat-file)))
      (setq spam-stat-dirty nil)))
  
  (defun spam-stat-load ()
    "Read the `spam-stat' hash table from disk."
    ;; TODO: maybe we should warn the user if spam-stat-dirty is t?
!   (load-file spam-stat-file)
    (setq spam-stat-dirty nil))
  
  (defun spam-stat-to-hash-table (entries)
--- 376,404 ----
    "Save the `spam-stat' hash table as lisp file."
    (interactive)
    (when (or force spam-stat-dirty)
!     (let ((coding-system-for-write 'emacs-mule))
!       (with-temp-file spam-stat-file
! 	  (let ((standard-output (current-buffer))
! 		(font-lock-maximum-size 0))
! 	    (insert ";-*- coding: emacs-mule; -*-\n")
! 	    (insert "(setq spam-stat-ngood "
! 		    (number-to-string spam-stat-ngood)
! 		    " spam-stat-nbad "
! 		    (number-to-string spam-stat-nbad)
! 		    " spam-stat (spam-stat-to-hash-table '(")
! 	    (maphash (lambda (word entry)
! 		       (prin1 (list word
! 				    (spam-stat-good entry)
! 				    (spam-stat-bad entry))))
! 		     spam-stat)
! 	    (insert ")))"))))
      (setq spam-stat-dirty nil)))
  
  (defun spam-stat-load ()
    "Read the `spam-stat' hash table from disk."
    ;; TODO: maybe we should warn the user if spam-stat-dirty is t?
!   (let ((coding-system-for-read 'emacs-mule))
!     (load-file spam-stat-file))
    (setq spam-stat-dirty nil))
  
  (defun spam-stat-to-hash-table (entries)
***************
*** 399,405 ****
  NBAD is the number of bad mails it has appeared in, GOOD is the number
  of times it appeared in good mails, and BAD is the number of times it
  has appeared in bad mails."
!   (let ((table (make-hash-table :test 'equal)))
      (mapc (lambda (l)
  	    (puthash (car l)
  		     (spam-stat-make-entry (nth 1 l) (nth 2 l))
--- 408,414 ----
  NBAD is the number of bad mails it has appeared in, GOOD is the number
  of times it appeared in good mails, and BAD is the number of times it
  has appeared in bad mails."
!   (let ((table (make-hash-table :size (length entries) :test 'equal)))
      (mapc (lambda (l)
  	    (puthash (car l)
  		     (spam-stat-make-entry (nth 1 l) (nth 2 l))
***************
*** 484,490 ****
                     (> (nth 7 (file-attributes f)) 0))
  	  (setq count (1+ count))
  	  (message "Reading %s: %.2f%%" dir (/ count max))
! 	  (insert-file-contents f)
  	  (funcall func)
  	  (erase-buffer))))))
  
--- 493,499 ----
                     (> (nth 7 (file-attributes f)) 0))
  	  (setq count (1+ count))
  	  (message "Reading %s: %.2f%%" dir (/ count max))
! 	  (insert-file-contents-literally f)
  	  (funcall func)
  	  (erase-buffer))))))
  
***************
*** 522,528 ****
  	  (setq count (1+ count))
  	  (message "Reading %.2f%%, score %.2f%%"
  		   (/ count max) (/ score count))
! 	  (insert-file-contents f)
  	  (when (> (spam-stat-score-buffer) 0.9)
  	    (setq score (1+ score)))
  	  (erase-buffer))))
--- 531,537 ----
  	  (setq count (1+ count))
  	  (message "Reading %.2f%%, score %.2f%%"
  		   (/ count max) (/ score count))
! 	  (insert-file-contents-literally f)
  	  (when (> (spam-stat-score-buffer) 0.9)
  	    (setq score (1+ score)))
  	  (erase-buffer))))

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: spam-stat.el and mime
  2004-01-20  5:56 ` Jesper Harder
@ 2004-01-21  0:17   ` Ted Zlatanov
  2004-01-21 20:41   ` Adam Sjøgren
  1 sibling, 0 replies; 11+ messages in thread
From: Ted Zlatanov @ 2004-01-21  0:17 UTC (permalink / raw)


On Tue, 20 Jan 2004, harder@ifa.au.dk wrote:

> Please try the attached code (it increased my spam recognition rate
> by 4 percentage points).

Jesper,

other than renaming to spam-wash.el (to be consistent with spam-stat
and spam-report), your code looks good.  Has anyone else tried it?
I'd like to use it, since you say it's much faster than Andrew's
original code.

Thanks
Ted



^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: spam-stat.el and mime
  2004-01-20  5:56 ` Jesper Harder
  2004-01-21  0:17   ` Ted Zlatanov
@ 2004-01-21 20:41   ` Adam Sjøgren
  2004-01-22  7:30     ` Jesper Harder
  1 sibling, 1 reply; 11+ messages in thread
From: Adam Sjøgren @ 2004-01-21 20:41 UTC (permalink / raw)


On Tue, 20 Jan 2004 06:56:17 +0100, Jesper wrote:

> Please try the attached code (it increased my spam recognition rate
> by 4 percentage points).

My results are:

 Jesper Harder (spamwash.el+patch): 0.995423
 Original spam-stat.el (ngnus-0.1): 0.995664
 Andrew Cohen (patch)             : 0.997591

Each time I installed the version to be tested, trained it, and then ran
spam-stat-test-directory on a spam-group with 4151 emails in it.

I noticed that Andrew Cohen's version was the only one that bothered to
decrypt the 300 GnuPG encrypted ham-messages that are part of my
ham-collection during training. Maybe that could account for the
difference? Maybe not...

I'm amazed at how well the original fares in this test; previously my
feeling was that not handling base64 made things much worse.

Maybe it's because I've moved 28000 emails out of my spam-dir
recently...


  Best regards,

-- 
 "hur gör ni alla coola gubbar"                               Adam Sjøgren
                                                         asjo@koldfront.dk




^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: spam-stat.el and mime
  2004-01-21 20:41   ` Adam Sjøgren
@ 2004-01-22  7:30     ` Jesper Harder
  2004-01-22 13:49       ` Reiner Steib
  2004-01-23  1:15       ` Jesper Harder
  0 siblings, 2 replies; 11+ messages in thread
From: Jesper Harder @ 2004-01-22  7:30 UTC (permalink / raw)


[-- Attachment #1: Type: text/plain, Size: 2299 bytes --]

spamtrap@koldfront.dk (Adam Sjøgren) writes:

> My results are:
>
>  Jesper Harder (spamwash.el+patch): 0.995423
>  Original spam-stat.el (ngnus-0.1): 0.995664
>  Andrew Cohen (patch)             : 0.997591
>
> Each time I installing the version to be tested, training and then ran
> spam-stat-test-directory on a spam-group with 4151 emails in it.

This is what I get with Andrew's latest version and the attached
version of spamwash[1]:

                  Spam                   Ham           Time
  -----------------------------------------------------------
  none        4900/5286 = 0.927    2/1641 = 0.0012     215 s
  spam-wash   5136/5286 = 0.972    2/1641 = 0.0012     486 s
  spamwash    5159/5286 = 0.976    3/1641 = 0.0018     394 s

The difference in detection rate between the two washers is probably
not large enough to be statistically significant.  And it's hardly
surprising that they're very close since they do nearly the same
thing.

The only major difference in output is that spamwash doesn't delete
the MIME headers:

   ------=_NextPart_1169_0527773208410
   Content-Type: text/html;
	  charset=iso-8859-1
   Content-Transfer-Encoding: Quoted-Printable

I don't know if it's the case, but some of that information might be
useful for the Bayesian filter.

Another possible advantage is that I think it's easier to customize
for users.  For example, if you wanted to wash HTML with Lynx before
analysis (to defeat poison words inserted as HTML comments) you could
write something like

    (defun spamwash-treat-html (cte ctl)
      (spamwash-decode-body cte ctl)
        (let ((func (cdr (assq 'lynx mm-text-html-washer-alist))))
            (apply (car func) (cdr func))))

and add ("text/html" . spamwash-treat-html) to
`spamwash-treatment-alist'.

An advantage with Andrew's code is that it's based on better tested
and debugged code.


[1] I stripped the "Xref" header before training (committed).
    Otherwise the prediction rate is too optimistic.

    I also did (define-coding-system-alias 'ks_c_5601-1987 'euc-kr),
    which helped both washers quite a bit.  ks_c_5601-1987 seems to be
    an alias or superset of euc-kr (does someone know?).  Maybe we
    should add it to `mm-charset-synonym-alist'.


[-- Attachment #2: spamwash.el --]
[-- Type: application/emacs-lisp, Size: 2843 bytes --]

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: spam-stat.el and mime
  2004-01-22  7:30     ` Jesper Harder
@ 2004-01-22 13:49       ` Reiner Steib
  2004-01-23  1:15       ` Jesper Harder
  1 sibling, 0 replies; 11+ messages in thread
From: Reiner Steib @ 2004-01-22 13:49 UTC (permalink / raw)


On Thu, Jan 22 2004, Jesper Harder wrote:

> The only major difference in output is that spamwash doesn't delete
> the MIME headers:
[...]
> I don't know if it's the case, but some of that information might be
> useful for the Bayesian filter.
>
> Another possible advantage is that I think it's easier to customize
> for users.  For example, if you wanted to wash HTML with Lynx before
> analysis (to defeat poison words inserted as HTML comments) [...]

IIRC, the bogofilter people found that it's better _not_ to strip the
poison words and the html stuff.  (Matthias, is this correct?)

Bye, Reiner.
-- 
       ,,,
      (o o)
---ooO-(_)-Ooo--- PGP key available via WWW   http://rsteib.home.pages.de/




^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: spam-stat.el and mime
  2004-01-22  7:30     ` Jesper Harder
  2004-01-22 13:49       ` Reiner Steib
@ 2004-01-23  1:15       ` Jesper Harder
  1 sibling, 0 replies; 11+ messages in thread
From: Jesper Harder @ 2004-01-23  1:15 UTC (permalink / raw)


Jesper Harder <harder@ifa.au.dk> writes:

>     I also did (define-coding-system-alias 'ks_c_5601-1987 'euc-kr),
>     which helped both washers quite a bit.  ks_c_5601-1987 seems to be
>     an alias or superset of euc-kr (does someone know?).

OK, as usual it's a stupid Microsoft mislabelling of one of their
crappy codepages (cp949), which is a superset of euc-kr.

>     Maybe we should add it to `mm-charset-synonym-alist'.

Done.




^ permalink raw reply	[flat|nested] 11+ messages in thread

end of thread, other threads:[~2004-01-23  1:15 UTC | newest]

Thread overview: 11+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2004-01-10 16:43 spam-stat.el and mime Andrew Cohen
2004-01-11 20:37 ` Adam Sjøgren
2004-01-12  4:47 ` Jesper Harder
2004-01-12 21:37 ` Ted Zlatanov
2004-01-13 19:42 ` Adam Sjøgren
2004-01-20  5:56 ` Jesper Harder
2004-01-21  0:17   ` Ted Zlatanov
2004-01-21 20:41   ` Adam Sjøgren
2004-01-22  7:30     ` Jesper Harder
2004-01-22 13:49       ` Reiner Steib
2004-01-23  1:15       ` Jesper Harder

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).