Gnus development mailing list
 help / color / mirror / Atom feed
From: Jesper Harder <harder@ifa.au.dk>
Subject: Re: spam-stat.el and mime
Date: Tue, 20 Jan 2004 06:56:17 +0100	[thread overview]
Message-ID: <m3llo3mada.fsf@defun.localdomain> (raw)
In-Reply-To: <87u133g3f4.fsf@andy.bu.edu> (Andrew Cohen's message of "Sat, 10 Jan 2004 11:43:27 -0500")

[-- Attachment #1: Type: text/plain, Size: 873 bytes --]

Andrew Cohen <cohen@andy.bu.edu> writes:

> Checking a bit, this was almost entirely because it did no decoding
> of mime (or base64) encoded articles. I've modified it to decode
> mime (if you don't like this it can be controlled by customizing the
> spam-treat-mime-function to nil).

I looked a bit more at it:

+ (defun spam-treat-article ()
+   "Treat the current buffer prior to spam analysis."
+   (interactive)
+   (spam-decode)
    ^^^^^^^^^^^^^

It doesn't work to call `spam-decode' here -- you have to decode each
MIME part separately.

It's not so easy to use the existing MIME parsing functions in Gnus
for this purpose.  They were written with display in mind, and they're
also very slow because they do a lot of fancy stuff, which is
unnecessary in this context.

Please try the attached code (it increased my spam recognition rate by
4 percentage points).


[-- Attachment #2: spamwash.el --]
[-- Type: application/emacs-lisp, Size: 2340 bytes --]

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #3: Type: text/x-patch, Size: 4714 bytes --]

*** /home/harder/gnus/lisp/spam-stat.el	Mon Jan  5 20:12:20 2004
--- /home/harder/cvsgnus/lisp/spam-stat.el	Tue Jan 20 06:54:45 2004
***************
*** 122,127 ****
--- 122,128 ----
  \f
  
  ;;; Code:
+ (require 'spamwash)
  
  (defgroup spam-stat nil
    "Statistical spam detection for Emacs.
***************
*** 171,176 ****
--- 172,182 ----
    :type 'number
    :group 'spam-stat)
  
+ (defcustom spam-stat-washing-hook '(spamwash)
+   "Hook applied to each message before analysis."
+   :type 'hook
+   :group 'spam-stat)
+ 
  (defvar spam-stat-syntax-table
    (let ((table (copy-syntax-table text-mode-syntax-table)))
      (modify-syntax-entry ?- "w" table)
***************
*** 291,296 ****
--- 297,303 ----
  
  (defun spam-stat-buffer-words ()
    "Return a hash table of words and number of occurences in the buffer."
+   (run-hooks 'spam-stat-washing-hook)
    (with-spam-stat-max-buffer-size
     (with-syntax-table spam-stat-syntax-table
       (goto-char (point-min))
***************
*** 369,395 ****
    "Save the `spam-stat' hash table as lisp file."
    (interactive)
    (when (or force spam-stat-dirty)
!     (with-temp-buffer
!       (let ((standard-output (current-buffer))
! 	    (font-lock-maximum-size 0))
! 	(insert "(setq spam-stat-ngood "
! 		(number-to-string spam-stat-ngood)
! 		" spam-stat-nbad "
! 		(number-to-string spam-stat-nbad)
! 		" spam-stat (spam-stat-to-hash-table '(")
! 	(maphash (lambda (word entry)
! 		   (prin1 (list word
! 				(spam-stat-good entry)
! 				(spam-stat-bad entry))))
! 		 spam-stat)
! 	(insert ")))")
! 	(write-file spam-stat-file)))
      (setq spam-stat-dirty nil)))
  
  (defun spam-stat-load ()
    "Read the `spam-stat' hash table from disk."
    ;; TODO: maybe we should warn the user if spam-stat-dirty is t?
!   (load-file spam-stat-file)
    (setq spam-stat-dirty nil))
  
  (defun spam-stat-to-hash-table (entries)
--- 376,404 ----
    "Save the `spam-stat' hash table as lisp file."
    (interactive)
    (when (or force spam-stat-dirty)
!     (let ((coding-system-for-write 'emacs-mule))
!       (with-temp-file spam-stat-file
! 	  (let ((standard-output (current-buffer))
! 		(font-lock-maximum-size 0))
! 	    (insert ";-*- coding: emacs-mule; -*-\n")
! 	    (insert "(setq spam-stat-ngood "
! 		    (number-to-string spam-stat-ngood)
! 		    " spam-stat-nbad "
! 		    (number-to-string spam-stat-nbad)
! 		    " spam-stat (spam-stat-to-hash-table '(")
! 	    (maphash (lambda (word entry)
! 		       (prin1 (list word
! 				    (spam-stat-good entry)
! 				    (spam-stat-bad entry))))
! 		     spam-stat)
! 	    (insert ")))"))))
      (setq spam-stat-dirty nil)))
  
  (defun spam-stat-load ()
    "Read the `spam-stat' hash table from disk."
    ;; TODO: maybe we should warn the user if spam-stat-dirty is t?
!   (let ((coding-system-for-read 'emacs-mule))
!     (load-file spam-stat-file))
    (setq spam-stat-dirty nil))
  
  (defun spam-stat-to-hash-table (entries)
***************
*** 399,405 ****
  NBAD is the number of bad mails it has appeared in, GOOD is the number
  of times it appeared in good mails, and BAD is the number of times it
  has appeared in bad mails."
!   (let ((table (make-hash-table :test 'equal)))
      (mapc (lambda (l)
  	    (puthash (car l)
  		     (spam-stat-make-entry (nth 1 l) (nth 2 l))
--- 408,414 ----
  NBAD is the number of bad mails it has appeared in, GOOD is the number
  of times it appeared in good mails, and BAD is the number of times it
  has appeared in bad mails."
!   (let ((table (make-hash-table :size (length entries) :test 'equal)))
      (mapc (lambda (l)
  	    (puthash (car l)
  		     (spam-stat-make-entry (nth 1 l) (nth 2 l))
***************
*** 484,490 ****
                     (> (nth 7 (file-attributes f)) 0))
  	  (setq count (1+ count))
  	  (message "Reading %s: %.2f%%" dir (/ count max))
! 	  (insert-file-contents f)
  	  (funcall func)
  	  (erase-buffer))))))
  
--- 493,499 ----
                     (> (nth 7 (file-attributes f)) 0))
  	  (setq count (1+ count))
  	  (message "Reading %s: %.2f%%" dir (/ count max))
! 	  (insert-file-contents-literally f)
  	  (funcall func)
  	  (erase-buffer))))))
  
***************
*** 522,528 ****
  	  (setq count (1+ count))
  	  (message "Reading %.2f%%, score %.2f%%"
  		   (/ count max) (/ score count))
! 	  (insert-file-contents f)
  	  (when (> (spam-stat-score-buffer) 0.9)
  	    (setq score (1+ score)))
  	  (erase-buffer))))
--- 531,537 ----
  	  (setq count (1+ count))
  	  (message "Reading %.2f%%, score %.2f%%"
  		   (/ count max) (/ score count))
! 	  (insert-file-contents-literally f)
  	  (when (> (spam-stat-score-buffer) 0.9)
  	    (setq score (1+ score)))
  	  (erase-buffer))))

  parent reply	other threads:[~2004-01-20  5:56 UTC|newest]

Thread overview: 11+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2004-01-10 16:43 Andrew Cohen
2004-01-11 20:37 ` Adam Sjøgren
2004-01-12  4:47 ` Jesper Harder
2004-01-12 21:37 ` Ted Zlatanov
2004-01-13 19:42 ` Adam Sjøgren
2004-01-20  5:56 ` Jesper Harder [this message]
2004-01-21  0:17   ` Ted Zlatanov
2004-01-21 20:41   ` Adam Sjøgren
2004-01-22  7:30     ` Jesper Harder
2004-01-22 13:49       ` Reiner Steib
2004-01-23  1:15       ` Jesper Harder

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=m3llo3mada.fsf@defun.localdomain \
    --to=harder@ifa.au.dk \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).