;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; Copyright (c) 1993 by William M. Perry (wmperry@indiana.edu)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; Functions to parse out a url and replace it in the buffer
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(require 'w3)

(defun w3-match (s x)
  "Return regexp match x in s"
  (substring s (match-beginning x) (match-end x)))

(defun w3-build-links-list ()
  "Build links out of url specs in the temporary buffer"
  (set-buffer " *W3*")
  (goto-char (point-min))
  (if buffer-read-only (toggle-read-only))
  (let* ((next-id (int-to-string (1+ (w3-find-highest-link-num))))
	 (cur-id "")
	 (cur-url "")
	 (cur-txt "")
	 (cur-urn "")
	 (cur-rel "")
	 (cur-href "")
	 (cur-rev "")
	 (cur-title "")
	 (cur-meth "")
	 )
    (goto-char (point-min))
    (while (re-search-forward w3-link-begin-regexp nil t)
      (let* ((start (match-beginning 0))
	     (cur-url (prog1
			  (buffer-substring (match-beginning 0)
					    (match-end 0))
			(replace-match "")))
	     (end   (if (re-search-forward w3-link-end-regexp nil t)
			(prog1
			    (match-beginning 0)
			  (replace-match ""))
		      (progn
			(end-of-line)
			(point)))))
	(if (string-match "NAME *= *\"*\\([^ >]*\\)\"*" cur-url)
	    (setq cur-id (w3-match cur-url 1))
	  (setq cur-id next-id
		next-id (int-to-string (1+ (string-to-int next-id)))))
	(if (string-match "HREF *= *\"*[ \\\t]*\\([^>\"]*\\)\"*" cur-url)
	    (setq cur-href (w3-match cur-url 1))
	  (setq cur-href nil))
	(if (string-match "[^H]REF *= *\\([^ ]*\\)" cur-url)
	    (setq cur-rel (w3-match cur-url 1)))
	(if (string-match "REV *= *\\([^ ]*\\)" cur-url)
	    (setq cur-rev (w3-match cur-url 1))
	  (setq cur-rev nil))
	(if (string-match "URN *= *\\([^ ]*\\)" cur-url)
	    (setq cur-rev (w3-match cur-url 1))
	  (setq cur-rev nil))
	(if (string-match "TITLE *= *\"\\([^\"]*\\)\"" cur-url)
	    (setq cur-title (w3-match cur-url 1))
	  (setq cur-title nil))
	(if (string-match "METHODS *= *\\([^ ]*\\)" cur-url)
	    (setq cur-meth (w3-match cur-url 1))
	  (setq cur-meth nil))
	(setq cur-txt (buffer-substring start end))
	(if (and
	     cur-href
	     (not
	      (string-match "^\\(news\\|ftp\\|http\\|file\\|telnet\\|gopher\\):"
			    cur-href)))
	    (let ((resolved (cond ((equal w3-current-type "http")
				   (concat "http://" w3-current-server ":"
					   w3-current-port))
				  ((equal w3-current-type "ftp")
				   (concat "file://" w3-current-server "/"))
				  (t "file:"))))
	      (setq cur-href (w3-remove-relative-links cur-href))
	      (setq cur-href
		    (cond
		     ((equal (string-to-char cur-href) ?/)
		      (concat resolved cur-href))
		     ((equal (string-to-char cur-href) ?#) cur-href)
		     (t (concat
			 resolved
			 (w3-basepath w3-current-file) "/" cur-href))))))
	(cond
	 ((eq w3-delimit-links 'linkname)
	  (goto-char end)
	  (insert (format "[%s]" cur-id)))
	 ((not (null w3-delimit-links))
	  (goto-char start)
	  (insert w3-link-start-delimiter)
	  (goto-char (+ end (length w3-link-start-delimiter)))
	  (insert w3-link-end-delimiter)
	  (setq end (+ end (length w3-link-start-delimiter)
		       (length w3-link-end-delimiter)))))
	(if cur-href
	    (progn
	      (w3-add-zone start end w3-node-style
			   (cons 'w3
				 (list cur-id cur-href cur-txt
				       cur-urn cur-rel cur-rev
				       cur-meth cur-title)))
	      (if (and w3-running-lemacs
		       (extent-at (1+ start)))
		  (set-extent-attribute (extent-at (1+ start)) 'highlight)))
	  (w3-add-zone start end w3-default-style
		       (cons 'w3 (list cur-id cur-href cur-txt
				       cur-urn cur-rel cur-rev
				       cur-meth cur-title))))))))

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; Functions to handle formatting an html buffer
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(defun w3-prepare-buffer ()
  "Function to prepare w3-buffer for processing."
  (message "Parsing...")
  (set-buffer " *W3*")
  (let ((annos (w3-fetch-personal-annotations)))
    (if annos
	(progn
	  (goto-char (point-max))
	  (insert
	   "<P>--------------\n<H1>Personal Annotations</H1><P><UL>")
	  (while annos
	    (goto-char (point-max))
	    (insert "\n<LI> " (car annos))
	    (setq annos (cdr annos)))
	  (insert "</UL>"))))
  (goto-char (point-min))
  (run-hooks 'w3-file-prepare-hooks)
  (if (and w3-running-lemacs
	   (face-differs-from-default-p w3-default-style))
      (set-extent-face (make-extent (point-min) (point-max)) w3-default-style))
  (setq fill-column (- (screen-width) w3-right-border))
  (if w3-running-epoch (setq fill-pixel (- (window-pixwidth) 10)))
  (if (and w3-use-telnet (equal w3-current-type "http"))
      (progn
	(goto-char (point-min))
	(kill-line 1)
	(w3-replace-regexp "Connection.*" "")))
  (let ((case-fold-search t)
	(ttl ""))
    (if (not (re-search-forward "<PLAINTEXT>" nil t))
	(progn
	  (goto-char (point-min))
	  (w3-replace-regexp (char-to-string 13) "");; Kill CTRLM's
	  (w3-replace-regexp (char-to-string 12) "");; Kill CTRLL's
	  (w3-replace-regexp "<\\(TBL[^>]*\\)>" "<\\1>\n<PRE>")
	  (w3-replace-regexp "</TBL>" "</TBL>\n</PRE>")
	  (w3-replace-regexp "<PRE[^>]*>" "<PRE>")
	  (goto-char (point-min))
	  (w3-balance-pre)
	  (w3-balance-xmp)
	  (message "Parsing...")
	  (w3-handle-arbitrary-tags)
	  (w3-ignore)
	  (w3-check-index)
	  (w3-kill-obsolete)
	  (w3-fix-xmp)
	  (w3-fix-pre)
	  (w3-handle-forms)
	  (w3-fix-extras)
	  (w3-fix-nonames)
	  (w3-handle-comments)
	  (goto-char (point-min))
	  (message "Parsing...")
	  (w3-handle-whitespace)
	  (w3-replace-regexp "<DL COMPACT>" "<DL>")
	  (w3-do-lists)
	  (w3-replace-regexp "</*[UD]L\\( COMPACT\\)*>" "\n")
	  (w3-replace-regexp "</*DIR>" "\n")
	  (w3-replace-regexp "</*MENU>" "\n")
	  (w3-replace-regexp "<LI>" "\n\t*")
	  (w3-replace-regexp "<DT>" "\n<DT>")
	  (w3-replace-regexp "<DD>" "\n\t*")
	  (w3-handle-headers)
	  (message "Parsing...")
	  (w3-handle-address)
	  (message "Parsing...")
	  (w3-handle-graphics)
	  (w3-restore-pre)
	  (w3-build-links-list)
	  (w3-handle-tables)
	  (goto-char (point-min))
	  (let ((st (if (re-search-forward "<TITLE>" nil t) (point) nil))
		(nd (if (re-search-forward "</TITLE>" nil t) (point) nil)))
	    (if st
		(progn
		  (setq ttl (buffer-substring st (- nd 8)))
		  (delete-region st nd))
	      (setq ttl (w3-basepath w3-current-file t)))
	    (setq ttl (buffer-name (generate-new-buffer ttl))))
	  (w3-fix-paragraphs)
	  (w3-replace-regexp "<[^>]*>" "")
	  (w3-fix-ampersands)
	  (w3-restore-xmp)
	  (goto-char (point-min))
	  (kill-buffer ttl)
	  (rename-buffer ttl)
	  (if w3-mutable-windows
	      (pop-to-buffer ttl)
	    (switch-to-buffer ttl))
	  (if (not buffer-read-only) (toggle-read-only))
	  (message "Done.")
	  (w3-mode)
	  (goto-char (point-min))
	  (if w3-find-this-link
	      (w3-find-specific-link w3-find-this-link))
	  (run-hooks 'w3-file-done-hooks)
	  (if w3-keep-history
	      (let ((url (w3-view-url t)))
		(if (and (not (assoc url w3-history-list))
			 (not (equal url "file:historylist")))
		    (setq w3-history-list
			  (cons (cons url ttl) w3-history-list)))))
	  (message "Done.")
	  (if w3-running-epoch
	      (setq buffer-style w3-default-style))
	  (if w3-running-FSF19
	      (setq w3-zones-list (w3-only-links)))
	  (sit-for 0))
      (progn
	(replace-match "")
	(w3-replace-regexp (char-to-string 13) "");; Kill CTRLM's
	(w3-replace-regexp (char-to-string 12) "");; Kill CTRLL's
	(goto-char (point-min))
	(let ((tmp (buffer-name (generate-new-buffer w3-current-file))))
	  (kill-buffer tmp)
	  (rename-buffer tmp)
	  (if (not buffer-read-only) (toggle-read-only))
	  (if w3-mutable-windows
	      (pop-to-buffer tmp)
	    (switch-to-buffer tmp))
	  (w3-mode)
	  (run-hooks 'w3-file-done-hooks)
	  (message "Done.")
	  (if w3-running-epoch
	      (setq buffer-style w3-default-style))
	  (sit-for 0))))))

(defun w3-handle-arbitrary-tags ()
  "Find occurences of <!ENTITY ...> and replace them correctly"
  (set-buffer " *W3*")
  (goto-char (point-min))
  (while (re-search-forward
	  "<!ENTITY[ \\\t]+\\([^ ]*\\)[ \\\t]+\"\\([^\"]*\\)\">" nil t)
    (let ((entity (buffer-substring (match-beginning 1) (match-end 1)))
	  (defn   (buffer-substring (match-beginning 2) (match-end 2))))
      (replace-match "")
      (w3-replace-regexp (regexp-quote (format "&%s;" entity)) defn))))

(defun w3-balance-xmp ()
  (set-buffer " *W3*")
  (goto-char (point-min))
  (let* ((st (w3-count-occurences "<XMP>"))
	 (nd (w3-count-occurences "</XMP>"))
	 (df (- st nd)))
    (goto-char (point-max))
    (while (> df 0)
      (setq df (1- df))
      (insert "</XMP>\n"))))

(defun w3-balance-pre ()
  (set-buffer " *W3*")
  (goto-char (point-min))
  (let* ((st (w3-count-occurences "<PRE>"))
	 (nd (w3-count-occurences "</PRE>"))
	 (df (- st nd)))
    (goto-char (point-max))
    (while (> df 0)
      (setq df (1- df))
      (insert "</PRE>\n"))))

(defun w3-fix-extras ()
  "Remove extra formatting commands that have no meaning on a dumb terminal"
  (set-buffer " *W3*")
  (goto-char (point-min))
  (while (re-search-forward
	  "<\\(TT\\|B\\|I\\|U\\|EM\\|STRONG\\|CODE\\|SAMP\\|KBD\\|VAR\\|DFN\\|CITE\\)>"
	  nil t)
    (let* ((st (match-beginning 0))
	   (dastyle (upcase (buffer-substring (match-beginning 1)
					      (match-end 1))))
	   (nd (progn
		 (replace-match "")
		 (if (re-search-forward (concat "</" dastyle ">") nil t)
		     (prog1
			 (match-beginning 0)
		       (replace-match ""))
		   (point))))
	   (sty (w3-lookup-style dastyle)))
      (w3-add-zone st nd sty (cons 'w3 'style)))))

(defun w3-fix-ampersands ()
  "Replace &#XXX with ASCII character XXX"
  (set-buffer " *W3*")
  (goto-char (point-min))
  (w3-replace-regexp "&lt;*" "<")
  (w3-replace-regexp "&gt;*" ">")
  (let ((case-fold-search nil))
    (w3-replace-regexp "&auml;*" "\344")
    (w3-replace-regexp "&Auml;*" "\304")
    (w3-replace-regexp "&uuml;*" "\374")
    (w3-replace-regexp "&Uuml;*" "\334")
    (w3-replace-regexp "&ouml;*" "\366")
    (w3-replace-regexp "&Ouml;*" "\326"))
  (goto-char (point-min))
  (while (re-search-forward "&#\\([0-9]+\\);*" nil t)
    (replace-match (char-to-string 
		    (string-to-int (buffer-substring (match-beginning 1)
						     (match-end 1))))))
  (goto-char (point-min))
  (w3-replace-regexp "&endash;" "--")
  (w3-replace-regexp "&emdash;" "---")
  (w3-replace-regexp "&amp;" "&"))

(defun w3-fix-pre ()
  "Extract <PRE> fields, and put them back in later."
  (set-buffer " *W3*")
  (goto-char (point-min))
  (setq w3-pre-data nil
	w3-pre-data-count 0)
  (while (re-search-forward "<PRE>" nil t)
    (let* ((start (match-beginning 0))
	   (end (progn (re-search-forward "</PRE>" nil t)
		       (point))))
      (setq w3-pre-data-count (1+ w3-pre-data-count)
	    w3-pre-data (cons (list w3-pre-data-count
				    (buffer-substring start end)) w3-pre-data))
      (delete-region start end)
      (goto-char start)
      (insert "***PREDATA" (int-to-string w3-pre-data-count)))))

(defun w3-restore-pre (&optional done)
  "Restore the <PRE> fields"
  (set-buffer " *W3*")
  (goto-char (point-min))
  (while (> w3-pre-data-count 0)
    (re-search-forward (concat "***PREDATA" (int-to-string w3-pre-data-count))
		       nil t)
    (replace-match (concat (if (not done) "<PRE>" "\n")
			   (substring
			    (car (cdr (assoc w3-pre-data-count w3-pre-data)))
			    5 -6)
			   (if (not done) "</PRE>" "\n")) t t)
    (goto-char (point-min))
    (setq w3-pre-data-count (1- w3-pre-data-count))))

(defun w3-fix-xmp ()
  "Extract <XMP> fields, and put them back in later."
  (set-buffer " *W3*")
  (goto-char (point-min))
  (setq w3-xmp-data nil
	w3-xmp-data-count 0)
  (while (re-search-forward "<XMP>" nil t)
    (let* ((start (match-beginning 0))
	   (end (progn (re-search-forward "</XMP>" nil t)
		       (point))))
      (setq w3-xmp-data-count (1+ w3-xmp-data-count)
	    w3-xmp-data (cons (list w3-xmp-data-count
				    (buffer-substring start end)) w3-xmp-data))
      (delete-region start end)
      (goto-char start)
      (insert "***XMPDATA" (int-to-string w3-xmp-data-count)))))

(defun w3-restore-xmp ()
  "Restore the <XMP> fields"
  (set-buffer " *W3*")
  (goto-char (point-min))
  (while (> w3-xmp-data-count 0)
    (goto-char (point-min))
    (re-search-forward (concat "***XMPDATA" (int-to-string w3-xmp-data-count))
		       nil t)
    (replace-match (concat "\n"
			   (substring
			    (car (cdr (assoc w3-xmp-data-count w3-xmp-data)))
			    5 -6) "\n") t t)
    (setq w3-xmp-data-count (1- w3-xmp-data-count))))

(defun w3-fix-nonames ()
  "Replace links with no name fields with bogus #s"
  (set-buffer " *W3*")
  (goto-char (point-min))
  (let ((bogus-num (1+ (w3-find-highest-link-num))))
    (while (re-search-forward "<A[ \t\n]+HREF" nil t)
      (replace-match (concat "<A NAME=" (int-to-string bogus-num) " HREF"))
      (setq bogus-num (1+ bogus-num)))))

(defun w3-check-index ()
  "Check to see if this is an indexed file.  If it is, update the mode line"
  (set-buffer " *W3*")
  (goto-char (point-min))
  (if (re-search-forward "<ISINDEX>" nil t)
      (progn
	(setq w3-current-isindex t)
	(replace-match ""))
    (setq w3-current-isindex nil)))

(defun w3-ignore ()
  "Ignore certain fields - (NEXTID, etc)"
  (set-buffer " *W3*")
  (goto-char (point-min))
  (w3-replace-regexp "<NEXTID *[A-Z]* *=* *\"*[a-zA-z0-9]*\"*>" ""))

(defun w3-kill-obsolete ()
  "Delete old/obsolete html headers/references"
  (set-buffer " *W3*")
  (goto-char (point-min))
  (w3-replace-regexp "</*HEAD\\(ER\\)*>" "")
  (w3-replace-regexp "</*BODY>" "")
  (w3-replace-regexp "<LISTING>" "<PRE>")
  (w3-replace-regexp "</LISTING>" "</PRE>"))

(defun w3-handle-whitespace ()
  "Fix newlines, tabs, and spaces"
  (set-buffer " *W3*")
  (goto-char (point-min))
  (w3-replace-regexp "<P>\\\n*" "<P>")
  (w3-replace-regexp "\\\n" " ")
  (w3-replace-regexp "\t" " ")
  (w3-replace-regexp "  +" " ")
  (w3-replace-regexp "\\\. +" ".  "))

(defun w3-handle-comments ()
  "Replace comments with blanks."
  (set-buffer " *W3*")
  (w3-replace-regexp "<!--[^>]*-->" ""))

(defun w3-handle-headers ()
  "Do the headers"
  (set-buffer " *W3*")
  (goto-char (point-min))
  (while (re-search-forward "<H\\([0-9]+\\)>" nil t)
    (let* ((siz (buffer-substring (match-beginning 1) (match-end 1)))
	   (st (prog1
		   (match-beginning 0)
		 (replace-match "\n")))
	   (end (progn
		  (if (re-search-forward (format "</H%s>" siz) nil t)
		      (prog1
			  (match-beginning 0)
			(replace-match "\n"))))))
      (w3-add-zone st end w3-header-style (cons 'w3 'header)))))

(defun w3-fix-paragraphs-in-region ()
  "Fill paragraphs in the visible part of the buffer"
  (w3-replace-regexp "<P>" "\n\n\n")
  (w3-replace-regexp "^ +" "")
  (w3-replace-regexp "  +" " ")
  (goto-char (point-min))
  (while (re-search-forward "^[^\\\n\\\t]" nil t)
    (fill-individual-paragraphs (progn (beginning-of-line) (point))
				(progn (end-of-line) (point))))
  (let ((x 1))
    (while (<= x 5)
      (goto-char (point-min))
      (let ((fill-prefix (concat (make-string x 9) "  ")))
	(while (re-search-forward (format "^%s\\\*"
					  (make-string x 9)) nil t)
	  (fill-region-as-paragraph (progn (beginning-of-line) (point))
				    (progn (end-of-line) (point))))
	(setq fill-prefix (concat (make-string x 9) "     "))
	(while (re-search-forward (format "^%s *[0-9]+"
					  (make-string x 9)) nil t)
	  (fill-region-as-paragraph (progn (beginning-of-line) (point))
				    (progn (end-of-line) (point)))))
      (setq x (1+ x))))
  (w3-replace-regexp "\\\n\\\n\\\t" "\n\t")
  (w3-replace-regexp "\\\n\\\n" "\n"))

(defun w3-fix-paragraphs (&optional pt recur)
  "Fix filling of paragraphs in a new buffer"
  (set-buffer " *W3*")
  (if (re-search-forward "<\\(PRE\\|XMP\\)>" nil t)
      (let ((st (if pt pt (point-min)))
	    (nd (- (point) 5))
	    (tp (buffer-substring (match-beginning 1) (match-end 1))))
	(save-restriction
	  (narrow-to-region st nd)
	  (w3-fix-paragraphs-in-region))
	(re-search-forward (format "</%s>" tp) nil t)
	(w3-fix-paragraphs (point) t))
    (narrow-to-region (point) (point-max))
    (w3-fix-paragraphs-in-region)
    (widen)))

(defun w3-handle-address ()
  "Handle the <ADDRESS> field"
  (set-buffer " *W3*")
  (goto-char (point-min))
  (while (re-search-forward "\\\n*<ADDRESS>\\\n*" nil t)
    (let* ((st (prog1
		   (match-beginning 0)
		 (replace-match "\n")))
	   (nd (progn
		 (if (re-search-forward "\\\n*</ADDRESS>\\\n*" nil t)
		     (prog1
			 (match-beginning 0)
		       (replace-match "\n"))
		   (point)))))
      (w3-add-zone st nd w3-address-style (cons 'w3 'address)))))

(defun w3-nuke-unsupported ()
  "Kill unsupported links/formatting"
  (set-buffer " *W3*")
  (w3-replace-regexp "<IMG[ \\\t]+[^Ss]*SRC[ \\\t]*=[ \\\t]*\"\\([^>]+\\)\">"
		     "&lt;<A HREF=\"\\1\">IMAGE</A>&gt;"))

(defun w3-handle-graphics ()
  (set-buffer " *W3*")
  (goto-char (point-min))
  (if (and w3-running-epoch (fboundp 'add-graphic-zone))
      (while (re-search-forward "<IMG +SRC=\"\\([^\"]+\\)\" *>" nil t)
	(let ((img (buffer-substring (match-beginning 1) (match-end 1)))
	      (st (match-beginning 0))
	      (nd (match-end 0)))
	  (replace-match "^")
	  (if (not (equal (string-to-char img) 47))
	      (setq img (concat (w3-basepath w3-current-file) "/" img)))
	  (w3-insert-graphic img st)))
    (w3-nuke-unsupported)))

(provide 'w3-parse)
