/* RCS-style revision identification string embedded in the object file. */
static char rcsid[] = "gopherenum-depth-first.c,v 1.1 1996/01/08 09:08:24 duane Exp";
/*
 *  gopherenum.c - RootNode URL enumerator for Gopher URLs
 *
 *  Usage: gopherenum gopher-URL
 *
 *  Outputs the following format:
 *
 *      URL of tree root
 *      URL <tab> md5
 *      ...
 *      URL <tab> md5
 *
 *  DEBUG: section  43, level 1         Gatherer enumeration for Gopher
 *
 *  Darren Hardy, hardy@cs.colorado.edu, June 1994
 *
 *  ----------------------------------------------------------------------
 *  Copyright (c) 1994, 1995.  All rights reserved.
 *  
 *    The Harvest software was developed by the Internet Research Task
 *    Force Research Group on Resource Discovery (IRTF-RD):
 *  
 *          Mic Bowman of Transarc Corporation.
 *          Peter Danzig of the University of Southern California.
 *          Darren R. Hardy of the University of Colorado at Boulder.
 *          Udi Manber of the University of Arizona.
 *          Michael F. Schwartz of the University of Colorado at Boulder.
 *          Duane Wessels of the University of Colorado at Boulder.
 *  
 *    This copyright notice applies to software in the Harvest
 *    ``src/'' directory only.  Users should consult the individual
 *    copyright notices in the ``components/'' subdirectories for
 *    copyright information about other software bundled with the
 *    Harvest source code distribution.
 *  
 *  TERMS OF USE
 *    
 *    The Harvest software may be used and re-distributed without
 *    charge, provided that the software origin and research team are
 *    cited in any use of the system.  Most commonly this is
 *    accomplished by including a link to the Harvest Home Page
 *    (http://harvest.cs.colorado.edu/) from the query page of any
 *    Broker you deploy, as well as in the query result pages.  These
 *    links are generated automatically by the standard Broker
 *    software distribution.
 *    
 *    The Harvest software is provided ``as is'', without express or
 *    implied warranty, and with no support nor obligation to assist
 *    in its use, correction, modification or enhancement.  We assume
 *    no liability with respect to the infringement of copyrights,
 *    trade secrets, or any patents, and are not responsible for
 *    consequential damages.  Proper use of the Harvest software is
 *    entirely the responsibility of the user.
 *  
 *  DERIVATIVE WORKS
 *  
 *    Users may make derivative works from the Harvest software, subject 
 *    to the following constraints:
 *  
 *      - You must include the above copyright notice and these 
 *        accompanying paragraphs in all forms of derivative works, 
 *        and any documentation and other materials related to such 
 *        distribution and use acknowledge that the software was 
 *        developed at the above institutions.
 *  
 *      - You must notify IRTF-RD regarding your distribution of 
 *        the derivative work.
 *  
 *      - You must clearly notify users that your are distributing 
 *        a modified version and not the original Harvest software.
 *  
 *      - Any derivative product is also subject to these copyright 
 *        and use restrictions.
 *  
 *    Note that the Harvest software is NOT in the public domain.  We
 *    retain copyright, as specified above.
 *  
 *  HISTORY OF FREE SOFTWARE STATUS
 *  
 *    Originally we required sites to license the software in cases
 *    where they were going to build commercial products/services
 *    around Harvest.  In June 1995 we changed this policy.  We now
 *    allow people to use the core Harvest software (the code found in
 *    the Harvest ``src/'' directory) for free.  We made this change
 *    in the interest of encouraging the widest possible deployment of
 *    the technology.  The Harvest software is really a reference
 *    implementation of a set of protocols and formats, some of which
 *    we intend to standardize.  We encourage commercial
 *    re-implementations of code complying to this set of standards.  
 *  
 */
#include <stdio.h>
#include <string.h>
#include <signal.h>
#include <stdlib.h>
#include <gdbm.h>
#include <GNUregex.h>
#include "util.h"
#include "url.h"
#define PUBLIC extern
#include "filter.h"

/*
 *  list_t - Node of a singly-linked list.  gopher_enum() builds a list
 *  of these whose ptr members are heap-allocated URL strings, and
 *  process_url() walks the list and frees both strings and nodes.
 */
typedef struct _list_t {
    void *ptr;			/* payload carried by this node */
    struct _list_t *next;	/* following node, or NULL at the tail */
} list_t;


/* Global variables */
int max_depth = 0;		/* HARVEST_DEPTH_MAX; 0 disables the depth check */
int start_depth = 0;		/* HARVEST_DEPTH_CUR; depth given to the root URL */

/* Local variables */
static int url_max = 0;		/* HARVEST_URL_MAX; stop after this many URLs */
static int nurls = 0;		/* URLs emitted so far by mark_retrieved() */
static int host_max = 0;	/* HARVEST_HOST_MAX; limit on distinct hosts */
static int nhosts = 0;		/* host counter kept by visit_server() */
static char *tree_root = NULL;	/* copy of the root URL, used in log messages */
static char *urldb_filename = NULL;	/* temporary file behind urldbf */
static char *hostdb_filename = NULL;	/* temporary file behind hostdbf */
static char *md5db_filename = NULL;	/* temporary file behind md5dbf */
static GDBM_FILE urldbf = NULL;	/* maps URL -> MD5 of visited URLs */
static GDBM_FILE hostdbf = NULL;	/* maps host address -> URL seen there */
static GDBM_FILE md5dbf = NULL;	/* maps MD5 -> URL of visited content */

/* Local functions */
/* Declared without parameter lists (K&R style); see the definitions below. */
static void process_url();
static void usage();
static void mark_retrieved();
static void sigdie();
static int url_in_db();
static int md5_in_db();

/* ---------------------------------------------------------------------- */

/*
 *  mark_retrieved() - Record a successfully retrieved URL and emit it.
 *
 *  Stores url -> md5 in the URL database and md5 -> url in the MD5
 *  database (keys that are already present are left alone), so that
 *  neither the same URL nor the same content is enumerated twice; this
 *  is what prevents cycles.  The "URL <tab> MD5" line is then written
 *  to stdout.  Once url_max URLs have been emitted, the enumeration is
 *  cut short by calling sigdie().
 */
static void mark_retrieved(up)
     URL *up;
{
    datum url_key;
    datum md5_key;

    url_key.dptr = xstrdup(up->url);
    url_key.dsize = strlen(url_key.dptr) + 1;
    md5_key.dptr = xstrdup(up->md5);
    md5_key.dsize = strlen(md5_key.dptr) + 1;

    /* URL database: keyed by URL, value is the MD5 */
    if (!gdbm_exists(urldbf, url_key)) {
	if (gdbm_store(urldbf, url_key, md5_key, GDBM_INSERT) != 0)
	    fatal("GDBM URLDB: %s: %s", url_key.dptr,
		gdbm_strerror(gdbm_errno));
    }
    /* MD5 database: keyed by MD5, value is the URL */
    if (!gdbm_exists(md5dbf, md5_key)) {
	if (gdbm_store(md5dbf, md5_key, url_key, GDBM_INSERT) != 0)
	    fatal("GDBM MD5DB: %s: %s", url_key.dptr,
		gdbm_strerror(gdbm_errno));
    }

    /* Enumerate the URL on stdout; flush so a reader on a pipe sees it now */
    printf("%s\t%s\n", up->url, up->md5);
    fflush(stdout);

    nurls++;
    if (nurls >= url_max) {
	Log("Truncating RootNode %s at %d LeafNode URLs\n",
	    tree_root, url_max);
	sigdie();
    }

    xfree(url_key.dptr);
    xfree(md5_key.dptr);
}

/*
 *  url_in_db() - Report whether the URL is a key in the URL database.
 *  Returns the result of gdbm_exists(): non-zero if present, else zero.
 */
static int url_in_db(url)
     char *url;
{
    datum key;
    int found;

    /* Keys are stored with their terminating NUL, so look up the same way */
    key.dptr = xstrdup(url);
    key.dsize = strlen(key.dptr) + 1;

    found = gdbm_exists(urldbf, key);

    xfree(key.dptr);
    return found;
}

/*
 *  md5_in_db() - Report whether the MD5 is a key in the MD5 database.
 *  Returns the result of gdbm_exists(): non-zero if present, else zero.
 */
static int md5_in_db(md5)
     char *md5;
{
    datum key;
    int found;

    /* Keys are stored with their terminating NUL, so look up the same way */
    key.dptr = xstrdup(md5);
    key.dsize = strlen(key.dptr) + 1;

    found = gdbm_exists(md5dbf, key);

    xfree(key.dptr);
    return found;
}

/*
 *  host_in_db() - Report whether the host is recorded in the host
 *  database.  The host is looked up with get_host() and its dotaddr
 *  member is used as the key.  Returns zero if the lookup fails;
 *  otherwise returns the result of gdbm_exists().
 */
static int host_in_db(host)
     char *host;
{
    Host *hp = get_host(host);
    datum key;
    int found;

    if (hp == (Host *) NULL)
	return 0;

    key.dptr = xstrdup(hp->dotaddr);
    key.dsize = strlen(key.dptr) + 1;

    found = gdbm_exists(hostdbf, key);

    xfree(key.dptr);
    return found;
}

/*
 *  visit_server() - Determine if we should visit the server.  Return
 *  zero if we should not process the URL; otherwise, return non-zero.
 *
 *  A host that is already in the host database may always be visited.
 *  A new host is admitted only while fewer than host_max hosts have
 *  been admitted; it is then stored (keyed by its dotaddr, with the URL
 *  as the value) and counted.  Hosts that cannot be looked up are
 *  refused and do not count against host_max.
 */
static int visit_server(up)
     URL *up;
{
    datum k, d;
    Host *h;

    if (host_in_db(up->host))	/* Host is already in the db */
	return (1);
    if (nhosts >= host_max)	/* No room left for another host */
	return (0);

    h = get_host(up->host);
    if (h == (Host *) NULL)
	return (0);
    k.dptr = xstrdup(h->dotaddr);
    k.dsize = strlen(k.dptr) + 1;
    d.dptr = xstrdup(up->url);
    d.dsize = strlen(d.dptr) + 1;

    if (gdbm_store(hostdbf, k, d, GDBM_INSERT))
	fatal("GDBM HOSTDB: %s: %s", k.dptr, gdbm_strerror(gdbm_errno));

    /*
     *  Count the host only now that it is really in the database, so
     *  that failed lookups and refused hosts never use up a slot.
     */
    nhosts++;

    xfree(k.dptr);
    xfree(d.dptr);
    return (1);
}

/*
 *  gopher_enum() - Retrieve a Gopher URL and list the menu items that
 *  it points to.
 *
 *  NULL is returned without a list if the URL was already visited, is
 *  removed by the filters, is on a server we may not visit, cannot be
 *  retrieved, or has an MD5 that was already seen.  Otherwise the URL
 *  is recorded and printed through mark_retrieved() (when it has an
 *  MD5).  If its gophertype is zero there is nothing to follow and
 *  NULL is returned.  For any other type the retrieved file is read as
 *  a menu with lines of the form
 *
 *      <type><name> TAB <selector> TAB <host> TAB <port>
 *
 *  and each line becomes a "gopher://host:port/<type><selector>\n"
 *  string.
 *
 *  Returns the head of a singly-linked list of list_t nodes whose ptr
 *  members are heap-allocated URL strings; the caller must free the
 *  strings and the nodes.  Returns NULL on error or if the menu yields
 *  no URLs.
 */
static list_t *gopher_enum(up)
     URL *up;
{
    char buf[BUFSIZ];
    list_t *head = NULL;
    list_t **Tail = NULL;
    list_t *l = NULL;
    FILE *fp = NULL;
    char *p = NULL;
    char *q = NULL;
    char *urlbuf = NULL;
    char *newurl = NULL;
    char *gopher_name = NULL;
    char *gopher_path = NULL;
    char *gopher_host = NULL;
    char *gopher_port = NULL;
    int y;

    if (url_in_db(up->url)) {	/* Have we been here? */
	Debug(43, 1, ("Already Visited URL: %s\n", up->url));
	return (NULL);
    }
    if ((y = filter_selection(up))) {	/* Match the URL based on REs */
	Debug(43, 1, ("Removing Candidate: [%s] %s\n",
		Filter_Type_Name[y], up->url));
	return (NULL);
    }
    if (!visit_server(up)) {	/* Can we visit this server? */
	Debug(43, 1, ("Disallowed to Visit Server: %s\n", up->url));
	return (NULL);
    }
    if (url_retrieve(up)) {	/* Grab the URL; success? */
	Debug(43, 1, ("Cannot Retrieve URL: %s\n", up->url));
	return (NULL);
    }
    if (up->md5 && md5_in_db(up->md5)) {	/* Have we been here? */
	Debug(43, 1, ("Already Visited MD5: %s\n", up->url));
	return (NULL);
    }
    /* Remember that we've been here before */
    if (up->md5 != NULL)
	mark_retrieved(up);
    if (up->gophertype == 0)
	return (NULL);

    /* 
     *  For each pointer, convert it to a URL, and add it to
     *  the list of URLs to return.
     */
    if ((fp = fopen(up->filename, "r")) == NULL) {
	log_errno2(__FILE__, __LINE__, up->filename);
	return (NULL);
    }
    Tail = &head;
    while (fgets(buf, BUFSIZ, fp)) {
	/* A line starting with '.' or an empty line ends the menu */
	if (buf[0] == '.' || buf[0] == '\n')
	    break;

	/*
	 *  Parse a private copy (urlbuf) destructively; buf only loses
	 *  its newline so that it can be quoted in error messages.
	 */
	urlbuf = xstrdup(buf);
	if ((q = strrchr(buf, '\n')))
	    *q = (char) '\0';

	p = urlbuf;
	if ((q = strchr(p, '\t')) == NULL) {
	    errorlog("Illegal Gopher format: No Name: %s\n", buf);
	    goto gopher_enum_cont;
	}
	*q = (char) '\0';
	gopher_name = xstrdup(p);

	p = q + 1;
	if ((q = strchr(p, '\t')) == NULL) {
	    errorlog("Illegal Gopher format: No Path: %s\n", buf);
	    goto gopher_enum_cont;
	}
	*q = (char) '\0';
	gopher_path = xstrdup(rfc1738_escape(p));

	p = q + 1;
	if ((q = strchr(p, '\t')) == NULL) {
	    errorlog("Illegal Gopher format: No Host: %s\n", buf);
	    goto gopher_enum_cont;
	}
	*q = (char) '\0';
	gopher_host = xstrdup(p);

	p = q + 1;
	if ((q = strchr(p, '\n')) == NULL) {
	    errorlog("Illegal Gopher format: No Port: %s\n", buf);
	    goto gopher_enum_cont;
	}
	*q = (char) '\0';
	gopher_port = xstrdup(p);

	/* Fix for weird cross-site Gopher links - wessels */
	if (!strncasecmp(gopher_path, "ftp%3a", 6))
	    goto gopher_enum_cont;
	if (!strncasecmp(gopher_path, "ftp:", 4))
	    goto gopher_enum_cont;
	if (!strncasecmp(gopher_path, "exec%3a", 7))
	    goto gopher_enum_cont;
	if (!strncasecmp(gopher_path, "exec:", 5))
	    goto gopher_enum_cont;

	/*
	 *  Size the URL from its parts instead of formatting into a
	 *  fixed BUFSIZ buffer: the menu comes from a remote server and
	 *  the escaped selector may be longer than the line it came
	 *  from, so a fixed buffer could be overrun.  Besides the host
	 *  and the selector, the string needs "gopher://", ':', the
	 *  port digits, '/', the type character, '\n' and the NUL;
	 *  32 bytes cover those.
	 */
	newurl = (char *) xmalloc(strlen(gopher_host) +
	    strlen(gopher_path) + 32);
	/* The first character of the name field is the Gopher item type */
	sprintf(newurl, "gopher://%s:%d/%c%s\n", gopher_host,
	    atoi(gopher_port), gopher_name[0], gopher_path);
	l = (list_t *) xmalloc(sizeof(list_t));
	l->ptr = (void *) newurl;	/* the list node now owns the string */
	l->next = (list_t *) NULL;
	*Tail = l;
	Tail = &(l->next);
	newurl = NULL;

      gopher_enum_cont:
	if (gopher_name)
	    xfree(gopher_name);
	gopher_name = NULL;
	if (gopher_path)
	    xfree(gopher_path);
	gopher_path = NULL;
	if (gopher_host)
	    xfree(gopher_host);
	gopher_host = NULL;
	if (gopher_port)
	    xfree(gopher_port);
	gopher_port = NULL;
	if (urlbuf)
	    xfree(urlbuf);
	urlbuf = NULL;
    }
    fclose(fp);
    return (head);
}

/*
 *  process_url() - Retrieves the given URL, computes an MD5,
 *  and extracts the list of menu pointers within the documents. 
 */
static void process_url(up, depth)
     URL *up;
     int depth;
{
    list_t *head = 0;
    list_t *l = 0;
    list_t *next_l = 0;
    char *url;
    URL *tup;

    if (max_depth > 0 && depth > max_depth) {
	Debug(43, 1, ("Maximum Depth of %d Reached: %s\n",
		max_depth, up->url));
	url_close(up);
	return;
    }
    Debug(43, 1, ("Processing: [%2d] %s\n", depth, up->url));

    if ((head = gopher_enum(up)) == NULL) {
	url_close(up);
	return;
    }
    /*
     *  Now, for each URL in the list, call process_url() if
     *  the URL is a Gopher url and it is on the same host
     */
    for (l = head; l; l = next_l) {
	next_l = l->next;
	url = (char *) l->ptr;
	if (url == (char *) NULL)
	    goto free_list_entry;
	if ((tup = url_open(url)) == NULL)
	    goto free_list_entry;
	if ((tup->type != URL_GOPHER)) {
	    url_close(tup);
	    goto free_list_entry;
	}
	if (tup->gophertype >= 2) {	/* ignore everything 2 or higher */
	    url_close(tup);
	    goto free_list_entry;
	}
	process_url(tup, depth + 1);	/* should be a 1 - menu */

      free_list_entry:
	xfree(l->ptr);
	xfree(l);
	url_close(tup);
    }
}

/* ---------------------------------------------------------------------- */

/*
 *  initialize() - Basic init routines
 */
static void initialize()
{
    char *s;
    FILE *logfp = NULL;

#ifdef USE_HOST_CACHE
    host_cache_init();
#endif

    max_depth = url_max = host_max = 0;
    if ((s = getenv("HARVEST_URL_MAX")) != NULL)
	url_max = atoi(s);
    if ((s = getenv("HARVEST_HOST_MAX")) != NULL)
	host_max = atoi(s);
    if ((s = getenv("HARVEST_DEPTH_MAX")) != NULL)
	max_depth = atoi(s);
    if ((s = getenv("HARVEST_DEPTH_CUR")) != NULL)
	start_depth = atoi(s);
    Debug(43, 9, ("HARVEST_DEPTH_CUR=%d\n", s ? s : "NULL"));
    if (url_max < 1)
	url_max = 250;		/* hard-coded maximum */
    if (host_max < 1)
	host_max = 1;		/* hard-coded maximum */
    if (max_depth < 1)
	max_depth = 0;		/* hard-coded maximum */
    host_filterfile = getenv("HARVEST_HOST_FILTER");
    url_filterfile = getenv("HARVEST_URL_FILTER");

    if (getenv("HARVEST_GATHERER_LOGFILE") != (char *) NULL)
	logfp = fopen(getenv("HARVEST_GATHERER_LOGFILE"), "a+");
    if (logfp == (FILE *) NULL)
	logfp = stderr;
    init_log3("gopherenum", logfp, stderr);
    init_url();
    filter_initialize();

    /* Open GDBM databases to keep track of where we've been */
    urldb_filename = xstrdup(tempnam(NULL, "Gurl"));
    urldbf = gdbm_open(urldb_filename, 0, GDBM_NEWDB, 0644, NULL);
    if (urldbf == NULL) {
	log_errno(urldb_filename);
	fatal("gdbm_open: %s: %s", urldb_filename,
	    gdbm_strerror(gdbm_errno));
    }
    hostdb_filename = xstrdup(tempnam(NULL, "Ghost"));
    hostdbf = gdbm_open(hostdb_filename, 0, GDBM_NEWDB, 0644, NULL);
    if (hostdbf == NULL) {
	log_errno(hostdb_filename);
	fatal("gdbm_open: %s: %s", hostdb_filename,
	    gdbm_strerror(gdbm_errno));
    }
    md5db_filename = xstrdup(tempnam(NULL, "Gmd5"));
    md5dbf = gdbm_open(md5db_filename, 0, GDBM_NEWDB, 0644, NULL);
    if (md5dbf == NULL) {
	log_errno(md5db_filename);
	fatal("gdbm_open: %s: %s", md5db_filename,
	    gdbm_strerror(gdbm_errno));
    }
}

/*
 *  sigdie() - Die gracefully: close the tracking databases, remove
 *  their temporary files, and exit with status 0.
 *
 *  main() installs this as the SIGTERM/SIGINT/SIGPIPE handler before
 *  initialize() has run, and it is also called directly to end the
 *  program.  A database handle or filename may therefore still be NULL
 *  here, so each one is checked before it is used.
 *
 *  NOTE(review): gdbm_close() and exit() are not async-signal-safe;
 *  confirm that calling them from a signal handler is acceptable here.
 */
static void sigdie()
{
    if (urldbf != NULL)
	gdbm_close(urldbf);
    if (hostdbf != NULL)
	gdbm_close(hostdbf);
    if (md5dbf != NULL)
	gdbm_close(md5dbf);

    /* Only remove files whose names were actually generated */
    if (urldb_filename != NULL)
	(void) unlink(urldb_filename);
    if (hostdb_filename != NULL)
	(void) unlink(hostdb_filename);
    if (md5db_filename != NULL)
	(void) unlink(md5db_filename);
    exit(0);
}

/* ---------------------------------------------------------------------- */

/*
 *  usage() - Print the command-line synopsis on stderr and exit with
 *  status 1.
 */
static void usage()
{
    fputs("Usage: gopherenum gopher-URL\n", stderr);
    exit(1);
}

/*
 *  main() - Parse the options, set up signal handling and the tracking
 *  databases, print the root URL, and enumerate the Gopher tree below
 *  it.  Exactly one non-option argument, a Gopher URL, is required.
 *  The program ends through sigdie().
 */
int main(argc, argv)
     int argc;
     char **argv;
{
    URL *root;

    debug_init();

    /* Step over the program name, then consume the leading options */
    argc--;
    argv++;
    while (argc > 0 && argv[0][0] == '-') {
	if (!strncmp(argv[0], "-D", 2))
	    debug_flag(argv[0]);
	argc--;
	argv++;
    }

    if (argc != 1)
	usage();

    /* Clean up on termination, interrupt, and broken pipe */
    signal(SIGTERM, sigdie);
    signal(SIGINT, sigdie);
    signal(SIGPIPE, sigdie);

    initialize();

    /* The remaining argument is the RootNode URL; it must be Gopher */
    root = url_open(argv[0]);
    if (root == NULL || root->type != URL_GOPHER)
	usage();

    /* Remember the RootNode and print it as the first output line */
    tree_root = xstrdup(root->url);
    printf("%s\n", root->url);

    /* Enumerate recursively, starting at the configured depth */
    process_url(root, start_depth);

    url_close(root);
    finish_url();
    sigdie();
    /* NOTREACHED */
}
