;-*- coding: utf-8 -*-
;; emacs lisp. emacs 22.
;; started: 2008-01-03.
;; Generate a report of Wikipedia links.
;;
;; This program traverses a given dir, visits every html file, finds the
;; links to Wikipedia in those files, collects them, generates an html
;; report of these links and the files they are from, and writes that
;; report to a given file.
;;
;; Xah Lee
;; ∑ http://xahlee.org/

;;;; user level global parameters

(defconst dirpath (expand-file-name "../")
  "The dir to process.")

(defconst root-path-char-count (length dirpath)
  "An integer that counts how many chars to take off of a given file's
full path, to result as a relative path for the link url.
e.g. if file path is “/Users/xah/web/emacs/emacs.html”, and
root-path-char-count is 15, then its url in link would be
“emacs/emacs.html”.
This number is not necessarily the length of dirpath.  It can be
smaller for flexibility.")

(defconst output-file (concat (expand-file-name "../") "wikipedia_links.html")
  "The file to save the generated report to. (existing file backedup as ~)")

;;;; loading package. global vars.

;; Name of the scratch buffer used both for scanning files and for
;; assembling the report.  The leading space hides it from buffer lists.
(setq tmpBufName (concat " xahtemp" (int-to-string (random t))))

(require 'find-lisp)

;; create hash table.
;; for each entry, the key is Wikipedia url, and value is a list of file paths.
;; like this: ("Wikipedia url" ("file1" "file2" ...))
(setq wpdata-hash (make-hash-table :test 'equal :size 4000))

;; a list version of the hash for sorting & report
(setq wpdata-list '())

;; header text for the generated HTML file
(setq header-text
"<html>
<head>
<title>Links To Wikipedia from XahLee.org</title>
</head>
<body>

<h1>Links To Wikipedia from XahLee.org</h1>

")

;; footer text for the generated HTML file
(setq footer-text
"
<p>2008-01</p>
<p>© 2008 by Xah Lee.</p>
</body>
</html>
")

;;;; subroutines

(defun insert-date ()
  "Insert current date, formatted as YYYY-MM-DD, at point.
If a region is active and `delete-selection-mode' or `cua-mode' is on,
the region is deleted first."
  (interactive)
  ;; `bound-and-true-p' guards against the mode variables being void
  ;; when the corresponding libraries have not been loaded.
  (if (and (or (bound-and-true-p delete-selection-mode)
               (bound-and-true-p cua-mode))
           mark-active)
      (delete-region (region-beginning) (region-end)))
  (insert (format-time-string "%Y-%m-%d")))

(defun hash-to-list (hashtable)
  "Return a list that represents HASHTABLE.
Each element is a two-element list (KEY VALUE).  The order of the
elements is whatever order `maphash' visits the entries in."
  (let (mylist)
    (maphash (lambda (kk vv)
               (setq mylist (cons (list kk vv) mylist)))
             hashtable)
    mylist))

(defun add-wplink-to-hash (filePath)
  "Get the Wikipedia links in FILEPATH and add them to `wpdata-hash'.
The contents of the current buffer are replaced by the contents of
FILEPATH.  Every matching url that does not contain “=” is used as a
key; FILEPATH is added to that key's list of files (at most once)."
  (insert-file-contents filePath nil nil nil t)
  (goto-char (point-min))
  (while (re-search-forward
          "href=\"\\(http://..\\.wikipedia\\.org/[^\"]+\\)\">\\([^<]+\\)"
          nil t)
    ;; Capture the url BEFORE calling `string-match', so that we never
    ;; depend on the buffer match data surviving a string search.
    (let ((url (match-string 1)))
      (unless (string-match "=" url)    ; not some history page
        (let ((files (gethash url wpdata-hash)))
          ;; A file may link to the same article several times; record
          ;; the file only once so the report does not repeat it.
          (unless (member filePath files)
            (puthash url (cons filePath files) wpdata-hash)))))))

(defun prt-each (ele)
  "Insert one report entry at point.
ELE is of the form (url (filepath1 filepath2 ...)).
It is inserted as one HTML list item, like this:
<li>WIKIPEDIA-LINK — FILE-LINK1, FILE-LINK2, ... .</li>
Each file link points to the file's path with the first
`root-path-char-count' chars removed, and is labeled with the file's
title as returned by `get-html-file-title'."
  (let ((wplink (car ele))
        (files (cadr ele)))
    (insert "<li>")
    (insert (wikipedia-url-to-link wplink))
    (insert " —")
    (dolist (x files)
      (insert (concat " <a href=\"" (substring x root-path-char-count) "\">"
                      (get-html-file-title x) "</a>,")))
    ;; Replace the trailing comma left by the loop with a period.
    (delete-char -1)
    (insert ".")
    (insert "</li>\n")))

(defun wikipedia-url-to-link (url)
  "Return URL as an html link string.
The link text is the last path segment of URL, percent-decoded, with
“&” written as “&amp;” and “_” replaced by a space.
Example: http://en.wikipedia.org/wiki/Emacs
becomes <a href=\"http://en.wikipedia.org/wiki/Emacs\">Emacs</a>.
If URL ends in “/” (empty last segment), URL itself is the link text."
  (require 'gnus-util)
  (let ((linktext (gnus-url-unhex-string url nil)))
    (setq linktext (car (last (split-string linktext "/"))))
    (when (equal linktext "")
      (setq linktext url))
    (setq linktext (replace-regexp-in-string "&" "&amp;" linktext))
    (setq linktext (replace-regexp-in-string "_" " " linktext))
    (concat "<a href=\"" url "\">" linktext "</a>")))

(defun get-html-file-title (fname)
  "Return the text of the <title> tag in file FNAME.
Expects the file to contain the string “<title>...</title>”.  If no
such pair is found, return the non-directory part of FNAME instead, so
that one malformed file does not abort the whole report."
  (with-temp-buffer
    (insert-file-contents fname)
    (goto-char (point-min))
    (let* ((x1 (search-forward "<title>" nil t))
           ;; `match-beginning' gives the position just before “</title>”.
           (x2 (and x1
                    (search-forward "</title>" nil t)
                    (match-beginning 0))))
      (if x2
          (buffer-substring-no-properties x1 x2)
        (file-name-nondirectory fname)))))

;;;; main

;; backup
(when (file-exists-p output-file)
  (copy-file output-file (concat output-file "~") t)
  (delete-file output-file))

;; get links from files, put to hash
(save-current-buffer
  (set-buffer (get-buffer-create tmpBufName))
  (let (filePaths)
    ;; get files ending in “.html” but not starting with “xx”.
    (dolist (x (find-lisp-find-files dirpath "\\.html$"))
      (unless (string-match "/xx" x)
        (setq filePaths (cons x filePaths))))
    (mapc 'add-wplink-to-hash filePaths))
  (setq wpdata-list (hash-to-list wpdata-hash))
  ;; sort entries by url, ignoring case
  (setq wpdata-list
        (sort wpdata-list
              (lambda (a b)
                (string< (downcase (car a)) (downcase (car b)))))))

;; print it out in a temp buffer and save to file
(switch-to-buffer tmpBufName)
(erase-buffer)
(insert header-text)
(insert "<p>This page contains all existing links from XahLee.org to Wikipedia, as of ")
(insert-date)
(insert ". There are a total of "
        (number-to-string (length wpdata-list))
        " links.</p>\n\n")
(insert "<ul>\n")
(mapc 'prt-each wpdata-list)
(insert "</ul>\n")
(insert footer-text)
(write-file output-file)

;; release the collected data
(clrhash wpdata-hash)
(setq wpdata-list '())