/* RCS identification string for this revision of cleandb.c */
static char rcsid[] = "cleandb.c,v 1.22 1996/01/17 10:07:46 duane Exp";
/*
 *  cleandb - Cleans up a GDBM database to prepare it for production use.
 *  Deletes every record whose SOIF template cannot be parsed or that
 *  lacks an 'Update-Time' or a required 'Gatherer-*' attribute, and
 *  re-stores any remaining template whose stored form does not match
 *  what the libtemplate print routine produces.
 *
 *  Usage: cleandb [-truncate] file
 *
 *  Darren Hardy, hardy@cs.colorado.edu, May 1994
 *
 *  Copyright (c) 1994, 1995.  All rights reserved.
 *  
 *    The Harvest software was developed by the Internet Research Task
 *    Force Research Group on Resource Discovery (IRTF-RD):
 *  
 *          Mic Bowman of Transarc Corporation.
 *          Peter Danzig of the University of Southern California.
 *          Darren R. Hardy of the University of Colorado at Boulder.
 *          Udi Manber of the University of Arizona.
 *          Michael F. Schwartz of the University of Colorado at Boulder.
 *          Duane Wessels of the University of Colorado at Boulder.
 *  
 *    This copyright notice applies to software in the Harvest
 *    ``src/'' directory only.  Users should consult the individual
 *    copyright notices in the ``components/'' subdirectories for
 *    copyright information about other software bundled with the
 *    Harvest source code distribution.
 *  
 *  TERMS OF USE
 *    
 *    The Harvest software may be used and re-distributed without
 *    charge, provided that the software origin and research team are
 *    cited in any use of the system.  Most commonly this is
 *    accomplished by including a link to the Harvest Home Page
 *    (http://harvest.cs.colorado.edu/) from the query page of any
 *    Broker you deploy, as well as in the query result pages.  These
 *    links are generated automatically by the standard Broker
 *    software distribution.
 *    
 *    The Harvest software is provided ``as is'', without express or
 *    implied warranty, and with no support nor obligation to assist
 *    in its use, correction, modification or enhancement.  We assume
 *    no liability with respect to the infringement of copyrights,
 *    trade secrets, or any patents, and are not responsible for
 *    consequential damages.  Proper use of the Harvest software is
 *    entirely the responsibility of the user.
 *  
 *  DERIVATIVE WORKS
 *  
 *    Users may make derivative works from the Harvest software, subject 
 *    to the following constraints:
 *  
 *      - You must include the above copyright notice and these 
 *        accompanying paragraphs in all forms of derivative works, 
 *        and any documentation and other materials related to such 
 *        distribution and use acknowledge that the software was 
 *        developed at the above institutions.
 *  
 *      - You must notify IRTF-RD regarding your distribution of 
 *        the derivative work.
 *  
 *      - You must clearly notify users that your are distributing 
 *        a modified version and not the original Harvest software.
 *  
 *      - Any derivative product is also subject to these copyright 
 *        and use restrictions.
 *  
 *    Note that the Harvest software is NOT in the public domain.  We
 *    retain copyright, as specified above.
 *  
 *  HISTORY OF FREE SOFTWARE STATUS
 *  
 *    Originally we required sites to license the software in cases
 *    where they were going to build commercial products/services
 *    around Harvest.  In June 1995 we changed this policy.  We now
 *    allow people to use the core Harvest software (the code found in
 *    the Harvest ``src/'' directory) for free.  We made this change
 *    in the interest of encouraging the widest possible deployment of
 *    the technology.  The Harvest software is really a reference
 *    implementation of a set of protocols and formats, some of which
 *    we intend to standardize.  We encourage commercial
 *    re-implementations of code complying to this set of standards.  
 *  
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include <time.h>
#include <gdbm.h>
#include "util.h"
#include "template.h"

/*
 *  MAX_BYTES - Maximum number of bytes allowed in a field during -truncate.
 *  Applied to attributes whose name contains "eyword".  Defaults to 8 KB;
 *  the #ifndef guard lets a definition that is already in effect (for
 *  example one supplied at build time) take precedence.
 */
#ifndef MAX_BYTES
#define MAX_BYTES	(8 * 1024)
#endif

/* Local functions */
static void usage();		/* prints the usage line and exits */
static void check_template();	/* validates one database record */
/* Set to 1 by main() when the -truncate option is given */
static int do_truncate = 0;

/*
 *  usage - Writes the command-line synopsis to stderr and terminates
 *  the program with exit status 1.  Does not return.
 */
static void usage()
{
	fputs("Usage: cleandb [-truncate] db\n", stderr);
	exit(1);
}

/*
 *  remove_keywords - Caps the recorded size of oversized keyword values.
 *
 *  Walks the attribute list of t.  For every attribute whose name contains
 *  "eyword" and whose vsize exceeds MAX_BYTES, logs how many bytes are
 *  dropped and sets vsize to MAX_BYTES.  Only the size field is changed
 *  here; the value bytes themselves are not touched.
 */
static void remove_keywords(t)
     Template *t;
{
	AVList *node;

	for (node = t->list; node != NULL; node = node->next) {
		/* Only attributes with "eyword" in their name qualify */
		if (strstr(node->data->attribute, "eyword") == NULL)
			continue;

		/* Values already within the limit are left alone */
		if (node->data->vsize <= MAX_BYTES)
			continue;

		Log("Trimmed %d bytes from %s attribute (%s)\n",
		    node->data->vsize - MAX_BYTES, t->url,
		    node->data->attribute);
		node->data->vsize = MAX_BYTES;
	}
}

static void check_template(dbf, k, d)
     GDBM_FILE dbf;
     datum k;
     datum d;
{
	Template *t = NULL;
	Buffer *b = NULL;
	datum nd;


	/* Parse the template to ensure correctness */
	init_parse_template_string(d.dptr, d.dsize);
	t = parse_template();
	finish_parse_template();

	if (t == NULL) {	/* Unparsable; delete it */
		Log("Deleting invalid SOIF: Unparsable: %s\n", k.dptr);
		gdbm_delete(dbf, k);
		xfree(k.dptr);
		xfree(d.dptr);
		return;
	}
	if (extract_AVPair(t->list, T_UPDATE) == NULL) {
		Log("Deleting invalid SOIF: No %s: %s\n", T_UPDATE, k.dptr);
		gdbm_delete(dbf, k);
		xfree(k.dptr);
		xfree(d.dptr);
		free_template(t);
		return;
	}
	if (extract_AVPair(t->list, T_GHOST) == NULL) {
		Log("Deleting invalid SOIF: No %s: %s\n", T_GHOST, k.dptr);
		gdbm_delete(dbf, k);
		xfree(k.dptr);
		xfree(d.dptr);
		free_template(t);
		return;
	}
	if (extract_AVPair(t->list, T_GNAME) == NULL) {
		Log("Deleting invalid SOIF: No %s: %s\n", T_GNAME, k.dptr);
		gdbm_delete(dbf, k);
		xfree(k.dptr);
		xfree(d.dptr);
		free_template(t);
		return;
	}
	if (extract_AVPair(t->list, T_GVERSION) == NULL) {
		Log("Deleting invalid SOIF: No %s: %s\n", T_GVERSION, k.dptr);
		gdbm_delete(dbf, k);
		xfree(k.dptr);
		xfree(d.dptr);
		free_template(t);
		return;
	}
	if (do_truncate)
		remove_keywords(t);


	/* Verify that the stored data is the same as the parsed template */
	b = init_print_template(NULL);
	print_template(t);
	nd.dptr = b->data;
	nd.dsize = b->length;

	if (d.dsize != nd.dsize)	/* Different templates, replace */
		(void) gdbm_store(dbf, k, nd, GDBM_REPLACE);

	/* Clean up */
	xfree(k.dptr);
	xfree(d.dptr);
	finish_print_template();
	free_template(t);
}

/*
 *  collect_keys - Takes a snapshot of every key currently in dbf.
 *
 *  Returns a malloc'ed array of the keys and stores their count in
 *  *nkeys.  The array is NULL when the database has no keys.  The caller
 *  owns the array and each key's dptr.  Exits with status 1 if the array
 *  cannot be grown.
 */
static datum *collect_keys(dbf, nkeys)
     GDBM_FILE dbf;
     size_t *nkeys;
{
	datum *keys = NULL;
	datum *grown;
	datum k;
	size_t used = 0;
	size_t cap = 0;

	for (k = gdbm_firstkey(dbf); k.dptr; k = gdbm_nextkey(dbf, k)) {
		if (used == cap) {
			cap = cap ? cap * 2 : 1024;
			/* Keep the old pointer until realloc is known good */
			grown = realloc(keys, cap * sizeof(*keys));
			if (grown == NULL) {
				errorlog("cleandb: out of memory after %lu keys\n",
				    (unsigned long) used);
				exit(1);
			}
			keys = grown;
		}
		keys[used++] = k;
	}
	*nkeys = used;
	return keys;
}

/*
 *  main - Entry point: cleandb [-truncate] db
 *
 *  Opens the named GDBM database as a writer and runs check_template()
 *  on every record.  The keys are collected before any record is
 *  checked, because check_template() may delete or replace records and
 *  the GDBM manual warns that changing the database during a
 *  gdbm_firstkey()/gdbm_nextkey() walk can cause keys to be skipped.
 *  The price is that all keys are held in memory at once.
 *
 *  Exits with status 0 on success, or 1 (through usage()) on a bad
 *  command line or when the database cannot be opened.
 */
int main(argc, argv)
     int argc;
     char *argv[];
{
	GDBM_FILE dbf;
	datum *keys;
	datum d;
	size_t nkeys = 0;
	size_t i;

	init_log3("cleandb", stdout, stderr);

	/* An optional leading -truncate is consumed before the db name */
	if (argc > 1 && !strcmp(argv[1], "-truncate")) {
		argc--;
		argv++;
		do_truncate = 1;
	}
	if (argc != 2)
		usage();

	dbf = gdbm_open(argv[1], 0, GDBM_WRITER, 0644, NULL);
	if (dbf == NULL) {
		errorlog("gdbm_open: %s: %s\n", argv[1],
		    gdbm_strerror(gdbm_errno));
		log_errno(argv[1]);
		usage();
	}

	/* Pass 1: snapshot the keys while the database is unmodified */
	keys = collect_keys(dbf, &nkeys);

	/* Pass 2: validate each record; check_template frees both dptrs */
	for (i = 0; i < nkeys; i++) {
		d = gdbm_fetch(dbf, keys[i]);
		check_template(dbf, keys[i], d);
	}
	free(keys);

	gdbm_close(dbf);
	exit(0);
}
