;;; Word Jumble - Requires a Common Lisp compiler ;;; (C) 2005 Doug Hoyte and Hardcore Software ;;; ;;; Aoccdrnig to a rscheearch at Cmabrigde Uinervtisy, it deosn't ;;; mttaer in waht oredr the ltteers in a wrod are, the olny iprmoetnt ;;; tihng is taht the frist and lsat ltteer be at the rghit pclae. The ;;; rset can be a toatl mses and you can sitll raed it wouthit porbelm. ;;; Tihs is bcuseae the huamn mnid deos not raed ervey lteter by istlef, ;;; but the wrod as a wlohe. ;;; ;;; I, like many other people on the internet, read this and wondered ;;; why this paragraph of text was so easy to read. I wondered how ;;; many words it was possible to confuse with other words using this ;;; jumbling technique (like bolt/blot for instance). ;;; ;;; Here is the output with a standard dictionary shipped with my ;;; linux distribution: ;;; ;;; ;; Loading file jumble.lisp ... ;;; ;; Loaded file jumble.lisp ;;; T ;;; Number of words: 44814 ;;; Number of uniques: 44425 ;;; Percentage unique: 99.131966% ;;; ;;; NIL ;;; ;;; Apparently about 99% of words can't be confused by this jumbling ;;; technique. Very interesting! I think there are likely some ;;; psychological issues involved as well, however. ;;; ;;; The algorithm is quite simple. We make a 26x26 array. One dimension ;;; corresponds to the first letter, and the other to the last letter. ;;; All elements start off as the empty list. We then procede to add lists ;;; of characters to the list pointed to by the corresponding array elements ;;; for each word in the dictionary. We count the number of words for each ;;; first/last letter pair. Then we sort the middle letters of each word ;;; then sort the words themselves so it's easy to remove duplicates. We ;;; then remove the duplicates and recount the words. ;;; ;;; Usage: ;;; ;;; (word-jumble-stats fname) will print out a summary like the one ;;; above. ;;; ;;; (word-jumble-listing fname) will print out a list of words that ;;; can be confused. ;;; ;;; fname should be a dictionary file in both instances. The format is ;;; simply one dictionary word per line. ;;; ;;; (defvar orignumwords 0) (defvar newnumwords 0) (defvar wordhash (make-hash-table :test 'equalp)) (defvar origwords (make-array '(26 26))) (defvar words (make-array '(26 26))) (defvar origlens (make-array '(26 26) :initial-element 0)) (defvar newlens (make-array '(26 26) :initial-element 0)) (defvar havedups nil) ;;; ;;; Utility Functions ;;; (defun compile-if-needed (f) (if (not (compiled-function-p f)) (compile f))) (defun compile-all (funlist) (mapcar #'compile-if-needed funlist)) (defun to-ind (ch) (- (char-int ch) (char-int #\a))) (defun from-ind (ind) (coerce (+ ind (char-int #\a)) 'character)) (defun getgoodlist (str) (remove-if #'(lambda (x) (not (alpha-char-p x))) (coerce (string-downcase str) 'list))) ;;; ;;; Efficient Large-Scale Dictionary Processor ;;; ;;; Don't include duplicates or words <= 3 characters in length ;;; (defun meets-criteria (in inlist) (and (> (length inlist) 3) (not (gethash in wordhash)))) (defun add-to-words (inlist) (let ((firstind (to-ind (car inlist))) (lastind (to-ind (car (last inlist)))) (middle (cdr (butlast inlist 1)))) (setf (aref words firstind lastind) (cons middle (aref words firstind lastind))) (setf (aref origwords firstind lastind) (cons (copy-list middle) (aref origwords firstind lastind))))) (defun load-up-dict (fname) (with-open-file (s fname :direction :input) (loop (let* ((in (read-line s nil 'eof)) (inlist (getgoodlist in))) (if (equal in 'eof) (return)) (if (meets-criteria in inlist) (progn (add-to-words inlist) (setf (gethash in wordhash) t) (incf orignumwords))))))) (defun process-dict () (dotimes (i 26 t) (dotimes (j 26 t) (setf (aref origlens i j) (length (aref words i j))) (mapcar #'(lambda (x) (sort x #'char<)) (aref words i j)) (sort (aref words i j) #'(lambda (x y) (string< (coerce x 'string) (coerce y 'string)))) (setf (aref words i j) (delete-duplicates (aref words i j) :test #'equal)) (setf (aref newlens i j) (length (aref words i j))) (incf newnumwords (aref newlens i j)) (if (/= (aref origlens i j) (aref newlens i j)) (push (list i j) havedups))))) ;;; ;;; Inefficient Display Duplicate Code ;;; ;;; Must run the efficient processor first: load-up-dict and proccess-dict ;;; (defun are-same-middle (x y) (let ((xa (make-array 26 :initial-element 0)) (ya (make-array 26 :initial-element 0))) (mapcar #'(lambda (ch) (incf (aref xa (to-ind ch)))) x) (mapcar #'(lambda (ch) (incf (aref ya (to-ind ch)))) y) (equalp xa ya))) (defun show-dups-at (i j) (let* ((tplist (remove-duplicates (aref origwords i j) :test #'are-same-middle)) (non-common (remove-if #'(lambda (x) (member x tplist)) (aref origwords i j)))) (format t "~&~a~%" (aref origwords i j)) (format t "~&~a~%" tplist))) (defun display-middles (f l a b) (let ((as (coerce (concatenate 'list (list (from-ind f)) a (list (from-ind l))) 'string)) (bs (coerce (concatenate 'list (list (from-ind f)) b (list (from-ind l))) 'string))) (format t "~&~a/~a" as bs))) (defun show-dups-at (fchar lchar) (let ((l (aref origwords fchar lchar))) (dotimes (i (- (length l) 1) t) (do ((j (+ i 1) (+ j 1))) ((> j (length l)) t) (if (are-same-middle (nth i l) (nth j l)) (display-middles fchar lchar (nth i l) (nth j l))))))) (defun show-all-dups (all) (mapcar #'(lambda (x) (show-dups-at (car x) (cadr x))) all)) (compile-all '(to-ind from-ind getgoodlist)) (compile-all '(meets-criteria add-to-words load-up-dict process-dict)) (compile-all '(are-same-middle show-dups-at show-all-dups)) (defun word-jumble-stats (fname) (load-up-dict fname) (process-dict) (format t "~&Number of words: ~a" orignumwords) (format t "~&Number of uniques: ~a" newnumwords) (format t "~&Percentage unique: ~a%~%~%" (* 100 (coerce (/ newnumwords orignumwords) 'float)))) (defun word-jumble-listing (fname) (load-up-dict fname) (process-dict) (show-all-dups havedups))