/* RCS revision identifier for this file; not referenced by any code visible in this file. */
static char rcsid[] = "HTMLurls.c,v 1.16 1996/01/08 09:08:20 duane Exp";
/* 
 *  HTMLurls - Prints all of the URLs from an HTML file.
 *
 *  Uses code from NCSA Mosaic (version 2.2) libhtmlw.
 *
 *  Usage: HTMLurls [--base-url url] filename
 *
 *  Darren Hardy, hardy@cs.colorado.edu, January 1995
 *
 *  ----------------------------------------------------------------------
 *  Copyright (c) 1994, 1995.  All rights reserved.
 *  
 *    The Harvest software was developed by the Internet Research Task
 *    Force Research Group on Resource Discovery (IRTF-RD):
 *  
 *          Mic Bowman of Transarc Corporation.
 *          Peter Danzig of the University of Southern California.
 *          Darren R. Hardy of the University of Colorado at Boulder.
 *          Udi Manber of the University of Arizona.
 *          Michael F. Schwartz of the University of Colorado at Boulder.
 *          Duane Wessels of the University of Colorado at Boulder.
 *  
 *    This copyright notice applies to software in the Harvest
 *    ``src/'' directory only.  Users should consult the individual
 *    copyright notices in the ``components/'' subdirectories for
 *    copyright information about other software bundled with the
 *    Harvest source code distribution.
 *  
 *  TERMS OF USE
 *    
 *    The Harvest software may be used and re-distributed without
 *    charge, provided that the software origin and research team are
 *    cited in any use of the system.  Most commonly this is
 *    accomplished by including a link to the Harvest Home Page
 *    (http://harvest.cs.colorado.edu/) from the query page of any
 *    Broker you deploy, as well as in the query result pages.  These
 *    links are generated automatically by the standard Broker
 *    software distribution.
 *    
 *    The Harvest software is provided ``as is'', without express or
 *    implied warranty, and with no support nor obligation to assist
 *    in its use, correction, modification or enhancement.  We assume
 *    no liability with respect to the infringement of copyrights,
 *    trade secrets, or any patents, and are not responsible for
 *    consequential damages.  Proper use of the Harvest software is
 *    entirely the responsibility of the user.
 *  
 *  DERIVATIVE WORKS
 *  
 *    Users may make derivative works from the Harvest software, subject 
 *    to the following constraints:
 *  
 *      - You must include the above copyright notice and these 
 *        accompanying paragraphs in all forms of derivative works, 
 *        and any documentation and other materials related to such 
 *        distribution and use acknowledge that the software was 
 *        developed at the above institutions.
 *  
 *      - You must notify IRTF-RD regarding your distribution of 
 *        the derivative work.
 *  
 *      - You must clearly notify users that your are distributing 
 *        a modified version and not the original Harvest software.
 *  
 *      - Any derivative product is also subject to these copyright 
 *        and use restrictions.
 *  
 *    Note that the Harvest software is NOT in the public domain.  We
 *    retain copyright, as specified above.
 *  
 *  HISTORY OF FREE SOFTWARE STATUS
 *  
 *    Originally we required sites to license the software in cases
 *    where they were going to build commercial products/services
 *    around Harvest.  In June 1995 we changed this policy.  We now
 *    allow people to use the core Harvest software (the code found in
 *    the Harvest ``src/'' directory) for free.  We made this change
 *    in the interest of encouraging the widest possible deployment of
 *    the technology.  The Harvest software is really a reference
 *    implementation of a set of protocols and formats, some of which
 *    we intend to standardize.  We encourage commercial
 *    re-implementations of code complying to this set of standards.  
 *  
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include "HTML.h"
#include "util.h"
#include "url.h"

/* Local Variables */
/*
 *  urls - Output buffer of extracted URLs, one per line.  Created in
 *  main(), appended to by process_anchor(), written to stdout by main().
 */
static Buffer *urls = NULL;
/*
 *  base - Base URL handed to url_parse_relative() for every anchor URL.
 *  Set from the --base-url option and/or the HREF of a BASE tag; NULL
 *  when neither has been seen.
 */
static char *base = NULL;

/* Global */
/*
 *  Url - Set in main() from the ENUMERATOR_URL environment variable,
 *  falling back to the input filename.  Not read anywhere in this file;
 *  presumably consumed by library code linked with it -- TODO confirm.
 */
char *Url = NULL;

/*
 *  usage() - Writes the command-line synopsis to stderr and exits with
 *  status 1.  Never returns.
 */
static void usage()
{
    fputs("Usage: HTMLurls [--base-url url] filename\n", stderr);
    exit(1);
}

/*
 *  strstr_icase - Looks for string b in string a, comparing
 *  case-insensitively.
 *
 *  Returns a pointer to the first match inside a, or NULL when b does
 *  not occur in a or when either argument is NULL.  An empty b matches
 *  at the start of a.
 */
char *strstr_icase(a, b)
     char *a, *b;
{
    size_t remaining, needle_len;
    char *p;

    if (a == NULL || b == NULL)
	return (NULL);

    /* size_t, not int: strlen() results must not be truncated */
    remaining = strlen(a);
    needle_len = strlen(b);

    /*
     * Stop as soon as fewer than needle_len characters are left; this
     * also keeps remaining from ever being decremented below zero.
     */
    for (p = a; remaining >= needle_len; p++, remaining--) {
	if (strncasecmp(p, b, needle_len) == 0)
	    return (p);
    }
    return (NULL);
}

/*
 *  process_anchor() - Extracts the URL from the anchor href tag.
 *
 *  Will process these anchors (HREF is case-insenstive):
 *
 *      <A HREF="url">
 *      <A HREF = "url">
 *      <A HREF = " url ">
 */
void process_anchor(s)
     char *s;
{
    char *p, *q, *tmps, *v;

    /* Find the HREF in the anchor */
    if ((tmps = strstr_icase(s, "href")) == NULL)
	return;

    /* Grab the URL from the HREF */
    if ((p = strchr(tmps, '=')) != NULL) {
	p++;			/* skip '=' */
	while (isspace(*p) || (*p == '\"'))
	    p++;		/* skip space '"'s */
	q = strdup(p);		/* copy URL */
	if ((p = strchr(q, '\"')) != NULL)	/* terminate string */
	    *p = '\0';
	if ((p = strchr(q, ' ')) != NULL)	/* terminate string */
	    *p = '\0';
	if (base != (char *) NULL) {
	    v = q;
	    q = url_parse_relative(v, base);
	    xfree(v);
	}
	if (q != (char *) NULL) {
	    add_buffer(urls, q, strlen(q));	/* Add URL to urls */
	    add_buffer(urls, "\n", 1);
	    xfree(q);
	}
	return;
    }
}

/*
 *  read_file() - Reads fp until fread() returns nothing more and
 *  collects the data in a newly created Buffer.
 *
 *  Returns the Buffer; main() releases it with free_buffer().  fp is
 *  left open for the caller to close.
 */
Buffer *read_file(fp)
     FILE *fp;
{
    Buffer *contents;
    char chunk[BUFSIZ];
    int got;

    contents = create_buffer(BUFSIZ);

    for (;;) {
	got = fread(chunk, 1, BUFSIZ, fp);
	if (got <= 0)		/* end of file or read error */
	    break;
	add_buffer(contents, chunk, got);
    }

    return (contents);
}

/*
 *  process_base() - Extracts the URL from the HREF of a BASE tag and
 *  makes it the base URL used for later anchors.
 *
 *  buf is the text of the tag.  Leading whitespace and double quotes
 *  after the '=' are skipped; the URL then ends at the first double
 *  quote or whitespace character.  The previous base, if any, is
 *  released.  Nothing changes when buf has no HREF or no '=' after it.
 */
void process_base(buf)
     char *buf;
{
    char *href = NULL;
    char *p = NULL;
    char *url = NULL;

    /* Find the HREF in the tag */
    if ((href = strstr_icase(buf, "href")) == (char *) NULL)
	return;

    /* Grab the URL from the HREF */
    if ((p = strchr(href, '=')) == NULL)
	return;
    p++;			/* skip '=' */

    /* cast: isspace() is undefined for negative char values */
    while (isspace((unsigned char) *p) || (*p == '\"'))
	p++;			/* skip space '"'s */

    /* NOTE(review): xstrdup() may never return NULL -- confirm in util */
    if ((url = xstrdup(p)) == (char *) NULL)	/* copy URL */
	return;

    /* terminate at the closing quote or at any whitespace */
    url[strcspn(url, "\" \t\r\n\f\v")] = '\0';

    /*
     * Hand the copy over to base directly.  Duplicating it a second
     * time, as was done before, leaked the first copy on every BASE tag.
     */
    xfree(base);
    base = url;
}

/*
 *  process_node() - Dispatches one parsed mark_up node.
 *
 *  M_BASE nodes go to process_base() and M_ANCHOR nodes go to
 *  process_anchor(); all other node types are ignored.  A node is also
 *  ignored when its start text is NULL or at most 5 characters long.
 *  NOTE(review): the threshold of 5 presumably rejects tags too short
 *  to carry an HREF attribute -- confirm.
 */
void process_node(mp)
     struct mark_up *mp;
{
    int is_base = (mp->type == M_BASE);
    int is_anchor = (mp->type == M_ANCHOR);

    if (!is_base && !is_anchor)
	return;
    if (mp->start == NULL || strlen(mp->start) <= 5)
	return;

    if (is_base)
	process_base(mp->start);
    if (is_anchor)
	process_anchor(mp->start);
}


/*
 *  free_struct_markup() - Releases a single mark_up node: its text,
 *  start and end members, then the node itself.  x must not be NULL.
 *  The node's next pointer is not followed.
 */
static void free_struct_markup(x)
     struct mark_up *x;
{
    /* free(NULL) is a no-op, so unset members need no guard */
    free(x->text);
    free(x->start);
    free(x->end);
    free(x);
}

/*
 *  main() - Parses an HTML file and prints the URL of every anchor.
 *
 *  Log output goes to the file named by HARVEST_GATHERER_LOGFILE when
 *  that file can be opened for appending, otherwise to stderr.
 *
 *  Options (leading arguments starting with '-'):
 *      -D...             passed to debug_flag()
 *      --base-url url    initial base URL for the anchors
 *  Any other option is skipped.  The first remaining argument is the
 *  file to read; usage() is called when it is missing.
 *
 *  Exits with 1 if the file cannot be opened, otherwise writes the
 *  collected URLs to stdout and exits with 0.
 */
int main(argc, argv)
     int argc;
     char *argv[];
{
    struct mark_up *HTMLParse();
    struct mark_up *nodes = NULL;
    struct mark_up *cur = NULL;
    struct mark_up *done = NULL;
    Buffer *contents = NULL;
    FILE *in = NULL;
    FILE *logfp = NULL;
    char *logname = NULL;
    char *enum_url = NULL;

    /* Choose the log destination */
    logname = getenv("HARVEST_GATHERER_LOGFILE");
    if (logname != (char *) NULL)
	logfp = fopen(logname, "a+");
    if (logfp == (FILE *) NULL)
	logfp = stderr;

    init_log3("HTMLurls", logfp, stderr);
    debug_init();

    /* Consume the leading options */
    argc--;
    argv++;
    while (argc > 0 && **argv == '-') {
	if (strncmp(*argv, "-D", 2) == 0) {
	    debug_flag(*argv);
	} else if (strcmp(*argv, "--base-url") == 0) {
	    /* the option's value is the next argument */
	    argc--;
	    argv++;
	    if (argc < 1)
		usage();
	    base = xstrdup(*argv);
	}
	argc--;
	argv++;
    }

    if (argc < 1)
	usage();

    /* Open the HTML file */
    in = fopen(*argv, "r");
    if (in == NULL) {
	log_errno(*argv);
	exit(1);
    }

    /* Url comes from the environment, else from the filename */
    enum_url = getenv("ENUMERATOR_URL");
    if (enum_url)
	Url = xstrdup(enum_url);
    if (Url == (char *) NULL)
	Url = xstrdup(*argv);

    /* Read the whole file, parse it, and drop the raw text */
    contents = read_file(in);
    fclose(in);
    nodes = HTMLParse(NULL, contents->data);
    free_buffer(contents);

    urls = create_buffer(BUFSIZ);

    /* Process each node, then free it once its successor is saved */
    cur = nodes;
    while (cur != NULL) {
	process_node(cur);
	done = cur;
	cur = cur->next;
	free_struct_markup(done);
    }

    fwrite(urls->data, 1, urls->length, stdout);
    free_buffer(urls);
    exit(0);
}
