/*		Parse HyperText Document Address		HTParse.c
**		================================
*/

#include"capalloc.h"
#include"capstdio.h"
#include "HTUtils.h"
#include "HTParse.h"
#include "tcp.h"

#define HEX_ESCAPE '%'

struct struct_parts {
	char * access;
	char * host;
	char * absolute;
	char * relative;
/*	char * search;		no - treated as part of path */
	char * anchor;
};


/*	Strip white space off a string
**	------------------------------
**
** On exit,
**	Return value points to first non-white character, or to 0 if none.
**	All trailing white space is OVERWRITTEN with zero.
*/

#ifdef __STDC__
char * HTStrip(char * s)
#else
char * HTStrip(s)
	char *s;
#endif
{
/*
 *	Strip leading and trailing white space (space, tab, newline) from
 *	the string s, in place.
 *
 *	Trailing white space is overwritten with zero bytes; leading white
 *	space is skipped by advancing the returned pointer.  The return
 *	value therefore points into s (at its terminator if s holds nothing
 *	but white space) and must not be freed separately.
 */
#define SPACE(c) (((c)==' ')||((c)=='\t')||((c)=='\n'))
    char * end = s + strlen(s);		/* One past the last character */

    /*
     *	Walk back from the end while the character before `end' is blank.
     *	Testing end > s first means the pointer never moves below the
     *	start of the buffer, even for empty or all-blank strings.
     */
    while (end > s && SPACE(end[-1])) {
	*--end = 0;			/* Zap trailing blanks */
    }

    while (SPACE(*s)) s++;		/* Strip leading blanks */
    return s;
}


static void scan(char *cp_name, struct struct_parts *SSPp_parts)	{
/*
 *	Purpose:	Break up an address name into its separate parts.
 *	Arguments:	cp_name		An address (URL) to break up.
 *					The name may be incomplete.
 *					The buffer is modified in place:
 *					separators are overwritten with
 *					'\0' so each part is its own string.
 *			SSPp_parts	The structure to store the different
 *					parts in.
 *	Return Value:	void
 *	Remarks/Portability/Dependencies/Restrictions:
 *		The following refers to the members of the passed in
 *		structure upon return of this function:
 *			The absolute xor relative are NULL.
 *			host, anchor, and access may be nonzero if they
 *				were found in the address.
 *			Any nonzero members point to ASCIIZ strings
 *				inside cp_name.
 *		The anchor is everything after the first '#'; only that
 *		one character is overwritten, so any later '#' stays part
 *		of the anchor text.
 *	Revision History:
 *		??-??-??	created
 *		03-28-94	modified for DosLynx
 */
	auto char *cp_after_access;
	auto char *cp_p;

	/*
	 *	Initialize all parts of the upcoming parts of the name.
	 */
	SSPp_parts->access = SSPp_parts->host = SSPp_parts->absolute =
		SSPp_parts->relative = SSPp_parts->anchor = NULL;

	/*
	 *	Until an access (i.e. http, ftp, etc...) is found, the
	 *	rest of the address starts at the very beginning.
	 *	There may be no access specified.
	 */
	cp_after_access = cp_name;

	/*
	 *	Look for the colon ending the access name.  The search
	 *	gives up at the first '/', '#' or '.', none of which can
	 *	be part of an access name.
	 */
	for(cp_p = cp_name; *cp_p != '\0'; cp_p++)	{
		if(*cp_p == ':')	{
			/*
			 *	End the access string here; no need to
			 *	keep the colon.  The rest of the address
			 *	continues directly after it.
			 */
			*cp_p = '\0';
			SSPp_parts->access = cp_name;
			cp_after_access = cp_p + 1;
			break;
		}
		if(*cp_p == '/' || *cp_p == '#' || *cp_p == '.')	{
			break;
		}
	}

	/*
	 *	Find the tag anchor.  A '#' can not occur before the
	 *	access colon (the loop above stops on it), so searching
	 *	forward from cp_after_access sees every '#' in the address.
	 *	Only the first one is replaced, which keeps the whole tail
	 *	intact and avoids both a length counter and a pointer that
	 *	walks below the start of the buffer.
	 */
	cp_p = strchr(cp_after_access, '#');
	if(cp_p != NULL)	{
		*cp_p = '\0';
		SSPp_parts->anchor = cp_p + 1;
	}

	/*
	 *	Start back up directly after the specified access type.
	 */
	cp_p = cp_after_access;

	/*
	 *	If we have a /, a host or root should follow.
	 */
	if(*cp_p == '/')	{
		if(*(cp_p + 1) == '/')	{
			/*
			 *	A second / introduces a host name, which
			 *	runs up to the next / (if any).
			 */
			SSPp_parts->host = cp_p + 2;
			cp_p = strchr(SSPp_parts->host, '/');

			/*
			 *	A path (root) was found: terminate the host
			 *	name and take what follows as the absolute
			 *	path.
			 */
			if(cp_p != NULL)	{
				*cp_p = '\0';
				SSPp_parts->absolute = cp_p + 1;
			}
		}
		else	{
			/*
			 *	There was no host specified, must use what
			 *	follows as the absolute path (root).
			 */
			SSPp_parts->absolute = cp_p + 1;
		}
	}
	else	{
		/*
		 *	There is no host or root (path) specification in
		 *	the address so it must be relative.
		 *	An empty remainder is reported as NULL, not "".
		 */
		SSPp_parts->relative = (*cp_after_access) ?
			cp_after_access : NULL;
	}

	/*
	 *	If there was an access type and an anchor specification
	 *	but no host, this is an exception.
	 *	We must restore the tag anchor symbol # to the address
	 *	and set that there is actually no anchor.
	 *	In these cases, the anchor is not really an anchor at all.
	 *	e.g. news:j462#36487@foo.bar
	 */
	if(SSPp_parts->access != NULL && SSPp_parts->host == NULL &&
		SSPp_parts->anchor != NULL)	{
		*(SSPp_parts->anchor - 1) = '#';
		SSPp_parts->anchor = NULL;
	}
}


extern char *HTParse(const char *cp_aName, const char *cp_relatedName,
	signed short int ssi_wanted)	{
/*
 *	Purpose:	Parse an address (URL) name relative to another
 *				(URL) name.
 *	Arguments:	cp_aName	The address to parse.
 *					NULL is treated as an empty string.
 *			cp_relatedName	The address cp_aName is taken
 *					relative to.
 *					NULL is treated as an empty string.
 *			ssi_wanted	A mask of PARSE_* bits selecting
 *					which parts go into the result
 *					(PARSE_ACCESS, PARSE_HOST,
 *					PARSE_PATH, PARSE_ANCHOR) and
 *					whether the separating punctuation
 *					is written (PARSE_PUNCTUATION).
 *	Return Value:	char *	A malloced string which is the resulting
 *				address requested according to the flags
 *				which were set and the original and
 *				related name.
 *	Remarks/Portability/Dependencies/Restrictions:
 *		All calling functions should free the memory allocated by
 *		HTParse once finished with the return value.
 *		Parts missing from cp_aName are inherited from
 *		cp_relatedName, except that a different access drops all
 *		related parts and a different host drops the related
 *		path and anchor.
 *		In the host part a default port (":80" for http, ":70" for
 *		gopher) and one trailing dot on the host name are removed.
 *	Revision History:
 *		??-??-??	created
 *		03-28-94	modified for DosLynx
 */

	auto char *cp_result;
	auto char *cp_return_value = NULL;
	auto char *cp_name = NULL;
	auto char *cp_rel = NULL;
	auto char *cp_access;
	auto struct struct_parts SSP_given, SSP_related;

	/*
	 *	Be forgiving about missing names rather than crashing in
	 *	strlen.
	 */
	if(cp_aName == NULL)	{
		cp_aName = "";
	}
	if(cp_relatedName == NULL)	{
		cp_relatedName = "";
	}

	/*
	 *	Allocate space for the result; more than enough, since
	 *	every part copied below comes out of one of the two names
	 *	and the punctuation added replaces separators scan removed.
	 *	The size is computed as a size_t so long names can not
	 *	overflow a short counter.
	 */
	cp_result = (char *)malloc(strlen(cp_aName) +
		strlen(cp_relatedName) + 10);
	if(cp_result == NULL)	{
		outofmem(__FILE__, "HTParse");
	}

	/*
	 *	Copy the input strings so that we can split them up into
	 *	their parts.  StrAllocCopy is a macro (see HTString), which
	 *	is how it manages to change the pointer it is handed.
	 */
	StrAllocCopy(cp_name, cp_aName);
	StrAllocCopy(cp_rel, cp_relatedName);

	/*
	 *	Break the allocated names up into their respective parts.
	 */
	scan(cp_name, &SSP_given);
	scan(cp_rel,  &SSP_related);

	/*
	 *	Begin building the requested address.
	 */
	*cp_result = '\0';

	cp_access = SSP_given.access ? SSP_given.access :
		SSP_related.access;

	/*
	 *	Requesting the access, if there is any, be sent back.
	 */
	if((ssi_wanted & PARSE_ACCESS) && cp_access != NULL)	{
		strcat(cp_result, cp_access);
		/*
		 *	Requestor also wants full URL style return.
		 *	Put in a : to separate access.
		 */
		if(ssi_wanted & PARSE_PUNCTUATION)	{
			strcat(cp_result, ":");
		}
	}

	/*
	 *	If both addresses specify an access and the two differ,
	 *	disregard all related information.
	 */
	if(SSP_given.access != NULL && SSP_related.access != NULL &&
		strcmp(SSP_given.access, SSP_related.access) != 0)	{
		SSP_related.host =
		SSP_related.absolute =
		SSP_related.relative =
		SSP_related.anchor = NULL;
	}

	/*
	 *	If requesting the host in the return, and either the given
	 *	or related address has one.
	 */
	if((ssi_wanted & PARSE_HOST) &&
		(SSP_given.host != NULL || SSP_related.host != NULL))	{
		auto char *cp_host;
		auto char *cp_port;
		auto char *cp_host_end;

		/*
		 *	If exact URL punctuation requested, add the
		 *	leading //
		 */
		if(ssi_wanted & PARSE_PUNCTUATION)	{
			strcat(cp_result, "//");
		}

		/*
		 *	Remember where the host starts, then append it.
		 */
		cp_host = cp_result + strlen(cp_result);
		strcat(cp_result, SSP_given.host != NULL ?
			SSP_given.host : SSP_related.host);

		/*
		 *	We must ignore default port numbers and trailing
		 *	dots on host names which will cause identical
		 *	addresses to look different.
		 */
		cp_port = strchr(cp_host, ':');
		if(cp_port != NULL && cp_access != NULL)	{
			if((strcmp(cp_access, "http") == 0
				&& strcmp(cp_port, ":80") == 0)
				|| (strcmp(cp_access, "gopher") == 0
				&& strcmp(cp_port, ":70") == 0))	{
				/*
				 *	Redundant, end the return address
				 *	before the port is specified.
				 */
				*cp_port = '\0';
				cp_port = NULL;
			}
		}

		/*
		 *	The host name itself ends at the port that was
		 *	kept, or at the end of the string.
		 */
		cp_host_end = (cp_port != NULL) ? cp_port :
			cp_host + strlen(cp_host);

		/*
		 *	If there is a period at the end of the host name,
		 *	kill it.  The test for a non-empty host keeps us
		 *	from looking in front of the host (or in front of
		 *	the buffer), and closing the gap with memmove
		 *	keeps a port that follows the period.
		 */
		if(cp_host_end > cp_host && *(cp_host_end - 1) == '.')	{
			memmove(cp_host_end - 1, cp_host_end,
				strlen(cp_host_end) + 1);
		}
	}

	/*
	 *	If there are different hosts, no relative path will be
	 *	assumed.
	 */
	if(SSP_given.host != NULL && SSP_related.host != NULL &&
		strcmp(SSP_given.host, SSP_related.host) != 0)	{
		SSP_related.absolute =
		SSP_related.relative =
		SSP_related.anchor = NULL;
	}

	/*
	 *	If the path is also part of the requested return.
	 */
	if(ssi_wanted & PARSE_PATH)	{
		if(SSP_given.absolute != NULL)	{
			/*
			 *	The absolute (full) path is already given;
			 *	use it, with its leading / if punctuation
			 *	is requested.
			 */
			if(ssi_wanted & PARSE_PUNCTUATION)	{
				strcat(cp_result, "/");
			}
			strcat(cp_result, SSP_given.absolute);
		}
		else if(SSP_related.absolute != NULL)	{
			/*
			 *	Otherwise, we must adopt the related path
			 *	but not its file name.
			 *	Remember where the path starts in the
			 *	result so the search below stays inside it.
			 */
			auto char *cp_path = cp_result + strlen(cp_result);

			if(ssi_wanted & PARSE_PUNCTUATION)	{
				strcat(cp_result, "/");
			}
			strcat(cp_result, SSP_related.absolute);

			/*
			 *	Check to see if we have a relative path
			 *	to further evaluate and append.
			 */
			if(SSP_given.relative != NULL)	{
				/*
				 *	The file name to drop ends at a
				 *	search directive (?) if there is
				 *	one, otherwise at the end of the
				 *	path.
				 */
				auto char *cp_cut = strchr(cp_path, '?');
				if(cp_cut == NULL)	{
					cp_cut = cp_path + strlen(cp_path);
				}

				/*
				 *	Back up to just after the last /.
				 *	When the path holds no / at all
				 *	(possible without punctuation) the
				 *	whole related path is the file name
				 *	and is dropped; the search never
				 *	leaves the path.
				 */
				while(cp_cut > cp_path &&
					*(cp_cut - 1) != '/')	{
					cp_cut--;
				}

				/*
				 *	Remove the file name from the
				 *	address and add the given relative
				 *	path and file.
				 */
				*cp_cut = '\0';
				strcat(cp_result, SSP_given.relative);

				/*
				 *	Simplify the resulting address by
				 *	taking out .. and . stuff
				 */
				HTSimplify(cp_result);
			}
		}
		/*
		 *	Otherwise we use what we have got.
		 */
		else if(SSP_given.relative != NULL)	{
			strcat(cp_result, SSP_given.relative);
		}
		else if(SSP_related.relative != NULL)	{
			strcat(cp_result, SSP_related.relative);
		}
		else	{
			/*
			 *	No inheritance at all.
			 */
			strcat(cp_result, "/");
		}
	}

	/*
	 *	If the anchor is requested also.
	 */
	if((ssi_wanted & PARSE_ANCHOR) &&
		(SSP_given.anchor != NULL || SSP_related.anchor != NULL))	{
		/*
		 *	Keep punctuation if requested.
		 */
		if(ssi_wanted & PARSE_PUNCTUATION)	{
			strcat(cp_result, "#");
		}
		strcat(cp_result, SSP_given.anchor != NULL ?
			SSP_given.anchor : SSP_related.anchor);
	}

	/*
	 *	Free up the copied names.
	 *	This also frees the memory pointed to by our SSP_* structs
	 */
	free(cp_rel);
	free(cp_name);

	/*
	 *	Allocate a new string that will be the correct length.
	 */
	StrAllocCopy(cp_return_value, cp_result);
	free(cp_result);
	return(cp_return_value);
}


/*	        Simplify a filename
//		-------------------
//
// A unix-style file is allowed to contain the sequence xxx/../ which may be
// replaced by "" , and the sequence "/./" which may be replaced by "/".
// Simplification helps us recognize duplicate filenames.
//
//	Thus, 	/etc/junk/../fred 	becomes	/etc/fred
//		/etc/junk/./fred	becomes	/etc/junk/fred
//
//      but we should NOT change
//		http://fred.xxx.edu/../..
//
//	or	../../albert.html
*/
#ifdef __STDC__
void HTSimplify(char * filename)
#else
void HTSimplify(filename)
    char * filename;
#endif

{
/*
 *	Collapse "/xxx/.." and "/." path segments of filename, in place.
 *
 *	Scanning starts at the third character.  A "/xxx/.." pair is only
 *	removed when xxx is preceded by a single slash and is not itself
 *	".."; so "http://host/.." and "../../x" are left alone.  If the
 *	removal empties the string it becomes "/".
 *
 *	The tail is shifted down with memmove because source and
 *	destination overlap, and after every removal the same position is
 *	examined again so that runs such as "/././" collapse completely.
 */
    char * p;
    char * q;

    if (!filename[0] || !filename[1]) return;	/* Bug fix 12 Mar 93 TBL */

    p = filename + 2;
    while (*p) {
	if (*p == '/') {
	    if ((p[1]=='.') && (p[2]=='.') && (p[3]=='/' || !p[3])) {
		/*
		 *  Find the previous slash without ever stepping below
		 *  the start of the string: stop with q just after it,
		 *  or at filename when there is none.
		 */
		q = p;
		while (q > filename && q[-1] != '/') q--;

		if (q > filename) {
		    q--;			/* q is the prev slash */
		    if (0 != strncmp(q, "/../", 4)
			    && !(q - filename > 1 && q[-1]=='/')) {
			memmove(q, p+3, strlen(p+3) + 1); /* Remove /xxx/.. */
			if (!*filename) strcpy(filename, "/");
			p = q;		/* Start again with prev slash */
			continue;
		    }
		}
		/* else   xxx/.. leave it! */
	    } else if ((p[1]=='.') && (p[2]=='/' || !p[2])) {
		memmove(p, p+2, strlen(p+2) + 1); /* Remove a slash and a dot */
		continue;		/* Look at this position again */
	    }
	}
	p++;
    }
}


/*		Make Relative Name
**		------------------
**
** This function creates and returns a string which gives an expression of
** one address as related to another. Where there is no relation, an absolute
** address is returned.
**
**  On entry,
**	Both names must be absolute, fully qualified names of nodes
**	(no anchor bits)
**
**  On exit,
**	The return result points to a newly allocated name which, if
**	parsed by HTParse relative to relatedName, will yield aName.
**	The caller is responsible for freeing the resulting name later.
**
*/
#ifdef __STDC__
char * HTRelative(const char * aName, const char *relatedName)
#else
char * HTRelative(aName, relatedName)
   char * aName;
   char * relatedName;
#endif
{
/*
 *	Express aName relative to relatedName.
 *
 *	The two names are compared character by character.  Depending on
 *	how far they agree the result is:
 *	    no common access		a copy of aName
 *	    fewer than 3 slashes	aName from just after the access colon
 *	    exactly 3 slashes		aName from the third slash (the path)
 *	    more than 3 slashes		one "../" for every slash left in the
 *					unmatched part of relatedName (up to
 *					a '#'), followed by aName from after
 *					the last matched slash
 *
 *	The result is newly allocated; the caller frees it.
 */
    char * result = 0;
    CONST char *p = aName;
    CONST char *q = relatedName;
    CONST char * after_access = 0;
    CONST char * path = 0;
    CONST char * last_slash = 0;
    int slashes = 0;

    for(;*p; p++, q++) {	/* Find extent of match */
    	if (*p!=*q) break;
	/*
	 *  Only the first colon ends the access name.  A later matching
	 *  colon (a port, for instance) must not move after_access, or
	 *  the "different nodes" result would start in the middle of
	 *  the host part.
	 */
	if (*p==':' && !after_access) after_access = p+1;
	if (*p=='/') {
	    last_slash = p;
	    slashes++;
	    if (slashes==3) path=p;
	}
    }

    /* q, p point to the first non-matching character or zero */

    if (!after_access) {			/* Different access */
        StrAllocCopy(result, aName);
    } else if (slashes<3){			/* Different nodes */
    	StrAllocCopy(result, after_access);
    } else if (slashes==3){			/* Same node, different path */
        StrAllocCopy(result, path);
    } else {					/* Some path in common */
        int levels= 0;
        for(; *q && (*q!='#'); q++)  if (*q=='/') levels++;
	/* Room for the "../" prefixes plus the tail after last_slash */
	result = (char *)malloc(3*levels + strlen(last_slash) + 1);
	if (result == NULL) outofmem(__FILE__, "HTRelative");
	result[0]=0;
	for(;levels; levels--)strcat(result, "../");
	strcat(result, last_slash+1);
    }
#ifndef RELEASE
    if (TRACE) fprintf(stderr, "HT: `%s' expressed relative to\n    `%s' is\n   `%s'.",
		aName, relatedName, result);
#endif /* RELEASE */
    return result;
}


/*		Escape undesirable characters using %		HTEscape()
**		-------------------------------------
**
**	This function takes a pointer to a string in which
**	some characters may be unacceptable unescaped.
**	It returns a string which has these characters
**	represented by a '%' character followed by two hex digits.
**
**	Unlike HTUnEscape(), this routine returns a malloced string.
*/

/*
**	Table of acceptability masks for character codes 32..127, indexed
**	by (code - 32).  HTEscape() ANDs an entry with its mask argument and
**	escapes the character when the result is zero.
*/
PRIVATE CONST unsigned char isAcceptable[96] =

/*	Bit 0		xalpha		-- see HTFile.h
**	Bit 1		xpalpha		-- as xalpha but with plus.
**	Bit 2 ...	path		-- as xpalphas but with /
*/
    /*   0 1 2 3 4 5 6 7 8 9 A B C D E F */
    {    0,0,0,0,0,0,0,0,0,0,7,6,0,7,7,4,	/* 2x   !"#$%&'()*+,-./	 */
         7,7,7,7,7,7,7,7,7,7,0,0,0,0,0,0,	/* 3x  0123456789:;<=>?	 */
	 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,	/* 4x  @ABCDEFGHIJKLMNO  */
	 7,7,7,7,7,7,7,7,7,7,7,0,0,0,0,7,	/* 5X  PQRSTUVWXYZ[\]^_	 */
	 0,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,	/* 6x  `abcdefghijklmno	 */
	 7,7,7,7,7,7,7,7,7,7,7,0,0,0,0,0 };	/* 7X  pqrstuvwxyz{|}~	DEL */

/*	Hex digits indexed by nibble value; used by HTEscape(). */
PRIVATE char *hex = "0123456789ABCDEF";

PUBLIC char * HTEscape ARGS2 (CONST char *, str,
	unsigned char, mask)
{
/*
**	Return a malloced copy of str in which every character that is not
**	acceptable under mask (see isAcceptable) is replaced by '%' and two
**	upper-case hex digits of its ASCII code.  The caller frees the copy.
*/
#define ACCEPTABLE(a)	( a>=32 && a<128 && ((isAcceptable[a-32]) & mask))
    CONST char * src = str;
    char * dst;
    char * escaped;
    int n_escapes = 0;

    /* First pass: count the characters needing an escape. */
    while (*src) {
	if (!ACCEPTABLE((unsigned char)TOASCII(*src)))
	    n_escapes++;
	src++;
    }

    /* Each escape turns one character into three, i.e. adds two. */
    escaped = (char *) malloc(src-str + n_escapes+ n_escapes + 1);
    if (escaped == NULL) outofmem(__FILE__, "HTEscape");

    /* Second pass: copy, expanding as we go. */
    dst = escaped;
    for (src = str; *src; src++) {
	unsigned char code = TOASCII(*src);
	if (ACCEPTABLE(code)) {
	    *dst++ = *src;
	} else {
	    *dst++ = HEX_ESCAPE;	/* Means hex coming */
	    *dst++ = hex[code >> 4];
	    *dst++ = hex[code & 15];
	}
    }
    *dst = 0;			/* Terminate */
    return escaped;
}


/*		Decode %xx escaped characters			HTUnEscape()
**		-----------------------------
**
**	This function takes a pointer to a string in which some
**	characters may have been encoded in %xy form, where xy is
**	the ASCII hex code for character 16x+y.
**	The string is converted in place, as it will never grow.
*/

/*	Value of one hex digit.  Anything that is not 0-9 or A-F is treated
**	as a lower-case letter, so the result is only meaningful for hex
**	digits.
*/
PRIVATE char from_hex ARGS1(char, c)
{
    if (c >= '0' && c <= '9')
	return c - '0';
    if (c >= 'A' && c <= 'F')
	return c - 'A' + 10;
    return c - 'a' + 10;	/* accept small letters just in case */
}

/*	True when c is one of 0-9, A-F or a-f. */
static int HTUnEscape_is_hex(char c)
{
    return (c >= '0' && c <= '9')
	|| (c >= 'A' && c <= 'F')
	|| (c >= 'a' && c <= 'f');
}

PUBLIC char * HTUnEscape ARGS1( char *, str)
{
/*
**	Decode %xy escapes in str, in place, and return str.
**
**	A '%' is only treated as an escape when it is followed by two hex
**	digits; the three characters are then replaced by the single
**	character with that ASCII code.  A '%' that is not followed by two
**	hex digits (including one cut short by the end of the string) is
**	copied through unchanged, so malformed input never produces stale
**	or half-decoded bytes.
*/
    char * p = str;
    char * q = str;
    while(*p) {
        /* The && chain stops at a terminator, so p[2] is never read
	** past the end of the string. */
        if (*p == HEX_ESCAPE
		&& HTUnEscape_is_hex(p[1]) && HTUnEscape_is_hex(p[2])) {
	    /* Build the code outside the macro argument and keep it
	    ** unsigned, so codes above 127 do not go negative. */
	    unsigned char code =
		(unsigned char)(from_hex(p[1]) * 16 + from_hex(p[2]));
	    *q++ = (char) FROMASCII(code);
	    p += 3;
	} else {
	    *q++ = *p++;
	}
    }

    *q = 0;
    return str;

} /* HTUnEscape */


