static char rcsid[] = "candidate.c,v 1.19 1996/01/05 20:28:52 duane Exp";
/*
 *  candidate.c - Candidate Selection for the Essence system.
 *
 *  Darren Hardy, hardy@cs.colorado.edu, February 1994
 *
 *  ----------------------------------------------------------------------
 *  Copyright (c) 1994, 1995.  All rights reserved.
 *  
 *    The Harvest software was developed by the Internet Research Task
 *    Force Research Group on Resource Discovery (IRTF-RD):
 *  
 *          Mic Bowman of Transarc Corporation.
 *          Peter Danzig of the University of Southern California.
 *          Darren R. Hardy of the University of Colorado at Boulder.
 *          Udi Manber of the University of Arizona.
 *          Michael F. Schwartz of the University of Colorado at Boulder.
 *          Duane Wessels of the University of Colorado at Boulder.
 *  
 *    This copyright notice applies to software in the Harvest
 *    ``src/'' directory only.  Users should consult the individual
 *    copyright notices in the ``components/'' subdirectories for
 *    copyright information about other software bundled with the
 *    Harvest source code distribution.
 *  
 *  TERMS OF USE
 *    
 *    The Harvest software may be used and re-distributed without
 *    charge, provided that the software origin and research team are
 *    cited in any use of the system.  Most commonly this is
 *    accomplished by including a link to the Harvest Home Page
 *    (http://harvest.cs.colorado.edu/) from the query page of any
 *    Broker you deploy, as well as in the query result pages.  These
 *    links are generated automatically by the standard Broker
 *    software distribution.
 *    
 *    The Harvest software is provided ``as is'', without express or
 *    implied warranty, and with no support nor obligation to assist
 *    in its use, correction, modification or enhancement.  We assume
 *    no liability with respect to the infringement of copyrights,
 *    trade secrets, or any patents, and are not responsible for
 *    consequential damages.  Proper use of the Harvest software is
 *    entirely the responsibility of the user.
 *  
 *  DERIVATIVE WORKS
 *  
 *    Users may make derivative works from the Harvest software, subject 
 *    to the following constraints:
 *  
 *      - You must include the above copyright notice and these 
 *        accompanying paragraphs in all forms of derivative works, 
 *        and any documentation and other materials related to such 
 *        distribution and use acknowledge that the software was 
 *        developed at the above institutions.
 *  
 *      - You must notify IRTF-RD regarding your distribution of 
 *        the derivative work.
 *  
 *      - You must clearly notify users that your are distributing 
 *        a modified version and not the original Harvest software.
 *  
 *      - Any derivative product is also subject to these copyright 
 *        and use restrictions.
 *  
 *    Note that the Harvest software is NOT in the public domain.  We
 *    retain copyright, as specified above.
 *  
 *  HISTORY OF FREE SOFTWARE STATUS
 *  
 *    Originally we required sites to license the software in cases
 *    where they were going to build commercial products/services
 *    around Harvest.  In June 1995 we changed this policy.  We now
 *    allow people to use the core Harvest software (the code found in
 *    the Harvest ``src/'' directory) for free.  We made this change
 *    in the interest of encouraging the widest possible deployment of
 *    the technology.  The Harvest software is really a reference
 *    implementation of a set of protocols and formats, some of which
 *    we intend to standardize.  We encourage commercial
 *    re-implementations of code complying to this set of standards.  
 *  
 */
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include "util.h"
#include "url.h"
#include "essence.h"

/*
 *  Type-name tables used by candidate selection.  init_stoplist() fills
 *  stoptypes[] from the file named by stoplist and allowtypes[] from the
 *  file named by allowlist, and sets every unused slot to NULL.
 *  stop_bytype() and allow_bytype() compare an object's type against
 *  these entries; finish_stoplist() releases them.
 */
static char *stoptypes[MAX_TYPES];
static char *allowtypes[MAX_TYPES];

/*
 *  load_typelist() - Reads type names from the file named by path into
 *  types[].  Lines whose first character is '#' are skipped; otherwise
 *  the first whitespace-delimited token of each line is copied and
 *  stored.  At most limit names are kept, and reading stops once that
 *  many have been stored.  Every slot from the last stored name up to
 *  MAX_TYPES is set to NULL.  A NULL path or a file that cannot be
 *  opened yields an empty (all-NULL) list.
 */
static void load_typelist(path, types, limit)
     char *path;
     char **types;
     int limit;
{
	FILE *fp;
	char buf[BUFSIZ], *s;
	int n = 0;

	if (path != NULL && (fp = fopen(path, "r")) != NULL) {
		/* Stop at limit so an oversized file cannot overrun types[] */
		while (n < limit && fgets(buf, BUFSIZ, fp)) {
			if (buf[0] == '#')
				continue;
			s = strtok(buf, " \t\n");
			if (s == NULL)
				continue;	/* blank line */
			/* Advance only on a successful copy: no NULL holes */
			if ((types[n] = strdup(s)) != NULL)
				n++;
		}
		fclose(fp);
	}
	for (; n < MAX_TYPES; n++)
		types[n] = NULL;
}

/*
 *  init_stoplist() - Initializes candidate selection step
 *
 *  Loads the stop list (file named by stoplist) into stoptypes[] and
 *  the allow list (file named by allowlist) into allowtypes[].  Each
 *  table keeps at most MAX_TYPES - 1 names so that it always ends with
 *  a NULL entry; extra names in a file are ignored.  When built with
 *  NO_UNIX_RECURSE, "Directory" is appended to the stop list if it is
 *  not already present.
 */
void init_stoplist()
{
#ifdef NO_UNIX_RECURSE
	int i;

	/* Leave one extra slot free for the "Directory" entry added below */
	load_typelist(stoplist, stoptypes, MAX_TYPES - 2);
#else
	load_typelist(stoplist, stoptypes, MAX_TYPES - 1);
#endif
	load_typelist(allowlist, allowtypes, MAX_TYPES - 1);

#ifdef NO_UNIX_RECURSE
	/* Add Directory by hand */
	for (i = 0; i < MAX_TYPES - 1 && stoptypes[i] != NULL; i++)
		if (!strcmp(stoptypes[i], "Directory"))
			break;
	/* Only write inside the table, and never over its final NULL */
	if (i < MAX_TYPES - 1 && stoptypes[i] == NULL)
		stoptypes[i] = strdup("Directory");
#endif
}

/*
 *  finish_stoplist() - Cleans up after candidate selection step
 *
 *  Releases every entry of stoptypes[] and allowtypes[] and resets the
 *  slot to NULL, so that a type lookup made after cleanup sees an empty
 *  list and a repeated call does not free the same pointer twice.
 */
void finish_stoplist()
{
	int i;

	for (i = 0; i < MAX_TYPES; i++) {
		if (stoptypes[i]) {
			xfree(stoptypes[i]);
			stoptypes[i] = NULL;
		}
		if (allowtypes[i]) {
			xfree(allowtypes[i]);
			allowtypes[i] = NULL;
		}
	}
}

/*
 *  allow_bytype() - Candidate selection on an object determined by
 *  its type.  Only allows objects with matching types.  Returns non-zero 
 *  if the object should be a candidate; returns zero otherwise.
 *  A NULL object, or an object with a NULL type, is never allowed.
 */
int allow_bytype(object)
     DataObject *object;
{
	int i;

	if (!object || !object->type)
		return (0);
	/*
	 *  Check the index before reading the slot, so that a table with
	 *  all MAX_TYPES entries in use is not read past its end.
	 */
	for (i = 0; i < MAX_TYPES && allowtypes[i] != NULL; i++) {
		if (!strcmp(allowtypes[i], object->type))
			return (1);
	}
	return (0);
}

/*
 *  stop_bytype() - Candidate selection on an object determined by
 *  its type.  Returns non-zero if the object should not be a
 *  candidate; returns zero otherwise.  A NULL object, or an object
 *  with a NULL type, is never stopped here.
 */
int stop_bytype(object)
     DataObject *object;
{
	int i;

	if (!object || !object->type)
		return (0);
	/*
	 *  Check the index before reading the slot, so that a table with
	 *  all MAX_TYPES entries in use is not read past its end.
	 */
	for (i = 0; i < MAX_TYPES && stoptypes[i] != NULL; i++) {
		if (!strcmp(stoptypes[i], object->type))
			return (1);
	}
	return (0);
}

/*
 *  stop_byname() - Candidate selection on an object determined by
 *  its name.  Returns non-zero if the object should not be a
 *  candidate; returns zero otherwise.
 *
 *  This is a placeholder: the object is not examined and zero is
 *  always returned, so nothing is rejected by name.
 */
int stop_byname(object)
     DataObject *object;
{
	int stop = 0;		/* no name-based rule is applied */

	return stop;
}

/*
 *  dup_with_suffix() - Returns the result of duplicate_url_any() on
 *  the concatenation of base and suffix.  The combined URL is built in
 *  heap memory sized to fit, so its length is not limited by a fixed
 *  buffer.  Returns zero if that memory cannot be allocated.
 */
static int dup_with_suffix(base, suffix)
     char *base;
     char *suffix;
{
	char *u;
	int r;

	u = malloc(strlen(base) + strlen(suffix) + 1);
	if (u == NULL)
		return (0);
	strcpy(u, base);
	strcat(u, suffix);
	r = duplicate_url_any(u);
	free(u);
	return (r);
}

/*
 *  url_sans_extension() - Returns a newly allocated copy of url with
 *  everything from its last '.' onward removed.  Returns NULL if url
 *  contains no '.', if the last '.' comes before the last '/' (so the
 *  final path component has no extension), or if the copy cannot be
 *  allocated.  The caller releases the result with xfree().
 */
static char *url_sans_extension(url)
     char *url;
{
	char *s, *dot, *slash;

	if ((s = strdup(url)) == NULL)
		return (NULL);
	dot = strrchr(s, '.');
	slash = strrchr(s, '/');
	/* A dot in the host or a directory name is not an extension */
	if (dot == NULL || (slash != NULL && dot < slash)) {
		xfree(s);
		return (NULL);
	}
	*dot = '\0';
	return (s);
}

/*
 *  stop_byduplicate() - Candidate selection on an object determined by
 *  a duplicate in the database.  A duplicate need not be an exact match;
 *  it could be another version of the object (like the compressed
 *  version).  Returns non-zero if the object should not be a candidate; 
 *  returns zero otherwise.  A NULL object, or an object without a URL,
 *  yields zero; an object with a NULL type skips the type-specific
 *  checks.
 */
int stop_byduplicate(object)
     DataObject *object;
{
	char *url, *base;
	int r;

	if (!object || !object->url || !object->url->url)
		return (0);
	url = object->url->url;

	/* 
	 *  If the object is not nested, then check to see if it's in db
	 */
	if ((object->flags & F_NESTED) == 0) {
		r = duplicate_url(url);
		if (r)
			return (r);
	}
	/*
	 *  If the object is compressed then check to see if the
	 *  uncompressed version has already been done.
	 */
	if (object->type != NULL &&
	    (!strcmp(object->type, "Compressed") ||
	     !strcmp(object->type, "GNUCompressed") ||
	     !strcmp(object->type, "GNUCompressedTar") ||
	     !strcmp(object->type, "CompressedTar"))) {
		base = url_sans_extension(url);	/* strip .Z, .gz, etc */
		if (base == NULL)
			return (0);
		r = duplicate_url_any(base);
		xfree(base);
		return (r);
	}
	/*
	 *  Now check to see if the compressed version was already in the
	 *  database.
	 */
	r = dup_with_suffix(url, ".Z");
	if (r)
		return (r);
	r = dup_with_suffix(url, ".gz");
	if (r)
		return (r);

	/*
	 *  If we have a PostScript file, prefer the Dvi or Text version.
	 *  This is a hack and doesn't work in all cases.  For example,
	 *  won't remove .ps.Z + .dvi.Z.
	 */
	if (object->type != NULL && !strcmp(object->type, "PostScript")) {
		base = url_sans_extension(url);	/* strip .ps */
		if (base == NULL)
			return (0);
		r = dup_with_suffix(base, ".dvi");	/* use DVI instead */
		if (r == 0)
			r = dup_with_suffix(base, ".txt");	/* use Text instead */
		xfree(base);
		return (r);
	}
	return (0);
}
