;;; Word Jumble - Requires a Common Lisp compiler
;;; (C) 2005 Doug Hoyte and Hardcore Software
;;;
;;; Aoccdrnig to a rscheearch at Cmabrigde Uinervtisy, it deosn't
;;; mttaer in waht oredr the ltteers in a wrod are, the olny iprmoetnt
;;; tihng is taht the frist and lsat ltteer be at the rghit pclae. The
;;; rset can be a toatl mses and you can sitll raed it wouthit porbelm.
;;; Tihs is bcuseae the huamn mnid deos not raed ervey lteter by istlef,
;;; but the wrod as a wlohe. 
;;;
;;; I, like many other people on the internet, read this and wondered
;;; why this paragraph of text was so easy to read. I wondered how
;;; many words it was possible to confuse with other words using this
;;; jumbling technique (like bolt/blot for instance).
;;;
;;; Here is the output with a standard dictionary shipped with my
;;; linux distribution:
;;;
;;; ;; Loading file jumble.lisp ...
;;; ;; Loaded file jumble.lisp
;;; T
;;; Number of words:    44814
;;; Number of uniques:  44425
;;; Percentage unique:  99.131966%
;;;
;;; NIL
;;;
;;; Apparently about 99% of words can't be confused by this jumbling
;;; technique. Very interesting! I think there are likely some
;;; psychological issues involved as well, however.
;;;
;;; The algorithm is quite simple. We make a 26x26 array. One dimension
;;; corresponds to the first letter, and the other to the last letter.
;;; All elements start off as the empty list. We then procede to add lists
;;; of characters to the list pointed to by the corresponding array elements
;;; for each word in the dictionary. We count the number of words for each
;;; first/last letter pair. Then we sort the middle letters of each word
;;; then sort the words themselves so it's easy to remove duplicates. We
;;; then remove the duplicates and recount the words.
;;;
;;; Usage:
;;;
;;; (word-jumble-stats fname) will print out a summary like the one
;;; above.
;;;
;;; (word-jumble-listing fname) will print out a list of words that
;;; can be confused.
;;;
;;; fname should be a dictionary file in both instances. The format is
;;; simply one dictionary word per line.
;;;
;;;


(defvar orignumwords 0)
(defvar newnumwords 0)
(defvar wordhash (make-hash-table :test 'equalp))
(defvar origwords (make-array '(26 26)))
(defvar words (make-array '(26 26)))
(defvar origlens (make-array '(26 26) :initial-element 0))
(defvar newlens (make-array '(26 26) :initial-element 0))
(defvar havedups nil)


;;;
;;; Utility Functions
;;;
(defun compile-if-needed (f)
    (if (not (compiled-function-p f)) (compile f)))

(defun compile-all (funlist)
  (mapcar #'compile-if-needed funlist))


(defun to-ind (ch)
  (- (char-int ch) (char-int #\a)))

(defun from-ind (ind)
  (coerce (+ ind (char-int #\a)) 'character))

(defun getgoodlist (str)
  (remove-if #'(lambda (x) (not (alpha-char-p x)))
             (coerce (string-downcase str) 'list)))


;;;
;;; Efficient Large-Scale Dictionary Processor
;;;
;;; Don't include duplicates or words <= 3 characters in length
;;;
(defun meets-criteria (in inlist)
  (and (> (length inlist) 3)
       (not (gethash in wordhash))))


(defun add-to-words (inlist)
  (let ((firstind (to-ind (car inlist)))
        (lastind (to-ind (car (last inlist))))
        (middle (cdr (butlast inlist 1))))
    (setf (aref words firstind lastind)
          (cons middle (aref words firstind lastind)))
    (setf (aref origwords firstind lastind)
          (cons (copy-list middle) (aref origwords firstind lastind)))))


(defun load-up-dict (fname)
  (with-open-file (s fname :direction :input)
    (loop
      (let* ((in (read-line s nil 'eof))
             (inlist (getgoodlist in)))
        (if (equal in 'eof) (return))
        (if (meets-criteria in inlist)
          (progn (add-to-words inlist)
                 (setf (gethash in wordhash) t)
                 (incf orignumwords)))))))


(defun process-dict ()
  (dotimes (i 26 t)
    (dotimes (j 26 t)
      (setf (aref origlens i j) (length (aref words i j)))

      (mapcar #'(lambda (x) (sort x #'char<)) (aref words i j))

      (sort (aref words i j) #'(lambda (x y)
                                 (string< (coerce x 'string)
                                          (coerce y 'string))))

      (setf (aref words i j)
        (delete-duplicates (aref words i j) :test #'equal))

      (setf (aref newlens i j) (length (aref words i j)))

      (incf newnumwords (aref newlens i j))

      (if (/= (aref origlens i j) (aref newlens i j))
        (push (list i j) havedups)))))


;;;
;;; Inefficient Display Duplicate Code
;;;
;;; Must run the efficient processor first: load-up-dict and proccess-dict
;;;
(defun are-same-middle (x y)
  (let ((xa (make-array 26 :initial-element 0))
        (ya (make-array 26 :initial-element 0)))
    (mapcar #'(lambda (ch) (incf (aref xa (to-ind ch)))) x)
    (mapcar #'(lambda (ch) (incf (aref ya (to-ind ch)))) y)
    (equalp xa ya)))


(defun show-dups-at (i j)
  (let* ((tplist (remove-duplicates (aref origwords i j) :test #'are-same-middle))
         (non-common (remove-if #'(lambda (x) (member x tplist)) (aref origwords i j))))
    (format t "~&~a~%" (aref origwords i j))
    (format t "~&~a~%" tplist)))


(defun display-middles (f l a b)
  (let ((as (coerce (concatenate 'list (list (from-ind f)) a (list (from-ind l))) 'string))
        (bs (coerce (concatenate 'list (list (from-ind f)) b (list (from-ind l))) 'string)))
    (format t "~&~a/~a" as bs)))


(defun show-dups-at (fchar lchar)
  (let ((l (aref origwords fchar lchar)))
    (dotimes (i (- (length l) 1) t)
      (do ((j (+ i 1) (+ j 1)))
          ((> j (length l)) t)
        (if (are-same-middle (nth i l) (nth j l))
          (display-middles fchar lchar (nth i l) (nth j l)))))))


(defun show-all-dups (all)
  (mapcar #'(lambda (x) (show-dups-at (car x) (cadr x))) all))


(compile-all '(to-ind from-ind getgoodlist))
(compile-all '(meets-criteria add-to-words load-up-dict process-dict))
(compile-all '(are-same-middle show-dups-at show-all-dups))


(defun word-jumble-stats (fname)
  (load-up-dict fname)
  (process-dict)
  (format t "~&Number of words:    ~a" orignumwords)
  (format t "~&Number of uniques:  ~a" newnumwords)
  (format t "~&Percentage unique:  ~a%~%~%" (* 100 (coerce (/ newnumwords orignumwords) 'float))))


(defun word-jumble-listing (fname)
  (load-up-dict fname)
  (process-dict)
  (show-all-dups havedups))