From mboxrd@z Thu Jan 1 00:00:00 1970 X-Msuck: nntp://news.gmane.io/gmane.emacs.gnus.general/55901 Path: main.gmane.org!not-for-mail From: Andrew Cohen Newsgroups: gmane.emacs.gnus.general Subject: spam-stat.el and mime Date: Sat, 10 Jan 2004 11:43:27 -0500 Sender: ding-owner@lists.math.uh.edu Message-ID: <87u133g3f4.fsf@andy.bu.edu> NNTP-Posting-Host: deer.gmane.org Mime-Version: 1.0 Content-Type: multipart/mixed; boundary="=-=-=" X-Trace: sea.gmane.org 1073780967 11653 80.91.224.253 (11 Jan 2004 00:29:27 GMT) X-Complaints-To: usenet@sea.gmane.org NNTP-Posting-Date: Sun, 11 Jan 2004 00:29:27 +0000 (UTC) Original-X-From: ding-owner+M4441@lists.math.uh.edu Sun Jan 11 01:29:21 2004 Return-path: Original-Received: from malifon.math.uh.edu ([129.7.128.13]) by deer.gmane.org with esmtp (Exim 3.35 #1 (Debian)) id 1AfTTk-000113-00 for ; Sun, 11 Jan 2004 01:29:20 +0100 Original-Received: from localhost ([127.0.0.1] helo=lists.math.uh.edu) by malifon.math.uh.edu with smtp (Exim 3.20 #1) id 1AfTTc-0002Kr-00; Sat, 10 Jan 2004 18:29:12 -0600 Original-Received: from justine.libertine.org ([66.139.78.221] ident=postfix) by malifon.math.uh.edu with esmtp (Exim 3.20 #1) id 1AfMCv-0008UH-00 for ding@lists.math.uh.edu; Sat, 10 Jan 2004 10:43:29 -0600 Original-Received: from quimby.gnus.org (quimby.gnus.org [80.91.224.244]) by justine.libertine.org (Postfix) with ESMTP id 84CC23A0064 for ; Sat, 10 Jan 2004 10:43:28 -0600 (CST) Original-Received: from news by quimby.gnus.org with local (Exim 3.35 #1 (Debian)) id 1AfMCt-00076I-00 for ; Sat, 10 Jan 2004 17:43:27 +0100 Original-To: ding@gnus.org Original-Path: not-for-mail Original-Newsgroups: gnus.ding Original-Lines: 249 Original-NNTP-Posting-Host: andy.bu.edu Original-X-Trace: quimby.gnus.org 1073753007 26545 128.197.41.152 (10 Jan 2004 16:43:27 GMT) Original-X-Complaints-To: usenet@quimby.gnus.org Original-NNTP-Posting-Date: Sat, 10 Jan 2004 16:43:27 +0000 (UTC) User-Agent: Gnus/5.110002 (No Gnus v0.2) Emacs/21.3.50 (gnu/linux) 
Cancel-Lock: sha1:Bn80xJ46QwcjEqdo9WdGhRKo/hw= Precedence: bulk Xref: main.gmane.org gmane.emacs.gnus.general:55901 X-Report-Spam: http://spam.gmane.org/gmane.emacs.gnus.general:55901 --=-=-= I've been using spam-stat.el for ages, but was unhappy that it only had a success rate of about 97%. After some checking, I found this was almost entirely because it did no decoding of MIME (or base64-encoded) articles. I've modified it to decode MIME (if you don't like this, it can be disabled by customizing spam-treat-mime-function to nil). After retraining, I now have a false-positive rate of less than 0.08% (no false positives on my test directory of 1300 ham emails) and a success rate of detecting spam of about 99.8%, which is as good as or better than any of the other Bayesian filters I've played with. Diffs attached. --=-=-= Content-Disposition: inline; filename=spam-stat.el.diff Content-Description: spam-stat.el.diff *** spam-stat.el.orig Sat Jan 10 11:33:03 2004 --- spam-stat.el Sat Jan 10 11:33:25 2004 *************** *** 171,176 **** --- 171,182 ---- :type 'number :group 'spam-stat) + (defcustom spam-treat-mime-function 'spam-treat-mime + "Function to treat MIME articles." + :group 'spam-stat + :type 'function) + + (defvar spam-stat-syntax-table (let ((table (copy-syntax-table text-mode-syntax-table))) (modify-syntax-entry ?- "w" table) *************** *** 461,466 **** --- 467,473 ---- (progn (set-buffer spam-stat-buffer) (goto-char (point-min)) + (spam-treat-article) (when (> (spam-stat-score-buffer) spam-stat-split-fancy-spam-threshhold) (when (boundp 'nnmail-split-trace) (mapc (lambda (entry) *************** *** 485,490 **** --- 492,498 ---- (setq count (1+ count)) (message "Reading %s: %.2f%%" dir (/ count max)) (insert-file-contents f) + (spam-treat-article) (funcall func) (erase-buffer)))))) *************** *** 503,519 **** (interactive) (hash-table-count spam-stat)) ! (defun spam-stat-test-directory (dir) "Test all the regular files in directory DIR for spam. 
If the result is 1.0, then all files are considered spam. ! If the result is 0.0, non of the files is considered spam. You can use this to determine error rates." (interactive "D") (let* ((files (directory-files dir t "^[^.]")) (total (length files)) (score 0.0); float (max (/ total 100.0)); float ! (count 0)) (with-temp-buffer (dolist (f files) (when (and (file-readable-p f) --- 511,531 ---- (interactive) (hash-table-count spam-stat)) ! (defun spam-stat-test-directory (dir &optional sp) "Test all the regular files in directory DIR for spam. If the result is 1.0, then all files are considered spam. ! If the result is 0.0, none of the files is considered spam. ! If SP eq spam, print the list of spam files. Otherwise if SP is non-nil ! print the non-spam files. You can use this to determine error rates." (interactive "D") (let* ((files (directory-files dir t "^[^.]")) (total (length files)) (score 0.0); float (max (/ total 100.0)); float ! (count 0) ! (spamf '()) ! (nspamf '())) (with-temp-buffer (dolist (f files) (when (and (file-readable-p f) *************** *** 523,531 **** (message "Reading %.2f%%, score %.2f%%" (/ count max) (/ score count)) (insert-file-contents f) (when (> (spam-stat-score-buffer) 0.9) ! (setq score (1+ score))) (erase-buffer)))) (message "Final score: %d / %d = %f" score total (/ score total)))) ;; Shrinking the dictionary --- 535,548 ---- (message "Reading %.2f%%, score %.2f%%" (/ count max) (/ score count)) (insert-file-contents f) + (push f nspamf) + (spam-treat-article) (when (> (spam-stat-score-buffer) 0.9) ! (setq score (1+ score)) ! (pop nspamf) ! 
(push f spamf)) (erase-buffer)))) + (when sp (print (if (equal sp "spam") spamf nspamf))) (message "Final score: %d / %d = %f" score total (/ score total)))) ;; Shrinking the dictionary *************** *** 562,567 **** --- 579,685 ---- (remove-hook 'gnus-select-article-hook 'spam-stat-store-gnus-article-buffer)) + (defun spam-treat-article () + "Treat the current buffer prior to spam analysis." + (interactive) + (spam-decode) + (let ((gnus-inhibit-treatment t) + (gnus-treatment-function-alist nil) + (gnus-article-image-alist nil) + (gnus-article-wash-types nil) + (gnus-article-buffer (current-buffer)) + (buffer-read-only nil)) + (when spam-treat-mime-function + (funcall spam-treat-mime-function)))) + + (defun spam-treat-mime (&optional ihandles) + "Treat MIME parts." + (save-excursion + (save-selected-window + (let ((window (get-buffer-window gnus-article-buffer)) + (point (point))) + (when window + (select-window window) + ;; We have to do this since selecting the window + ;; may change the point. So we set the window point. + (set-window-point window point))) + (let* ((handles (or ihandles + (mm-dissect-buffer nil gnus-article-loose-mime) + (and gnus-article-emulate-mime + (mm-uu-dissect)))) + buffer-read-only handle name type b e display) + (when (and (not ihandles) + (not gnus-displaying-mime)) + ;; Top-level call; we clean up. + (when gnus-article-mime-handles + (mm-destroy-parts gnus-article-mime-handles) + (setq gnus-article-mime-handle-alist nil));; A trick. + (setq gnus-article-mime-handles handles) + ;; We allow users to glean info from the handles. + (when gnus-article-mime-part-function + (gnus-mime-part-function handles))) + (if (and handles + (or (not (stringp (car handles))) + (cdr handles))) + (progn + (when (and (not ihandles) + (not gnus-displaying-mime)) + ;; Clean up for mime parts. 
+ (article-goto-body) + (delete-region (point) (point-max))) + (let ((gnus-displaying-mime t)) + (spam-treat-parts handles)))))))) + + (defun spam-treat-parts (handle) + (if (stringp (car handle)) + (mapcar 'spam-treat-parts (cdr handle)) + (if (bufferp (car handle)) + (save-restriction + (narrow-to-region (point) (point)) + (when (let ((case-fold-search t)) + (string-match "text" (car (mm-handle-type handle)))) + (mm-insert-part handle)) + (goto-char (point-max))) + (mapcar 'spam-treat-parts handle)))) + + + (defun spam-decode () + "Translate a quoted-printable-encoded or base64 article." + (interactive) + (save-excursion + (let ((buffer-read-only nil) type charset) + (if (gnus-buffer-live-p (current-buffer)) + (with-current-buffer (current-buffer) + (setq type + (gnus-fetch-field "content-transfer-encoding")) + (let* ((ct (gnus-fetch-field "content-type")) + (ctl (and ct + (ignore-errors + (mail-header-parse-content-type ct))))) + (setq charset (and ctl + (mail-content-type-get ctl 'charset))) + (if (stringp charset) + (setq charset (intern (downcase charset))))))) + (unless charset + (setq charset gnus-newsgroup-charset)) + (if (and type (let ((case-fold-search t)) + (string-match "quoted-printable" type))) + (progn + (article-goto-body) + (quoted-printable-decode-region + (point) (point-max) (mm-charset-to-coding-system charset))) + (progn + (when (and type (let ((case-fold-search t)) + (string-match "base64" type))) + (article-goto-body) + (save-restriction + (narrow-to-region (point) (point-max)) + (ignore-errors (base64-decode-region (point-min) (point-max)) + (mm-decode-coding-region + (point-min) (point-max) + (mm-charset-to-coding-system charset)))))))))) + + (provide 'spam-stat) ;;; spam-stat.el ends here --=-=-= Regards, Andy --=-=-=--