/* Revision identification string in RCS ident layout (file, revision, date, time, author, state). */
static char rcsid[] = "unnest.c,v 1.54 1996/01/05 20:29:00 duane Exp";
/*
 *  unnest.c - Presentation Unnesting for the Essence system.
 *
 *  DEBUG: section  65, level 1         Gatherer essence object unnesting
 *
 *  Darren Hardy, hardy@cs.colorado.edu, February 1994
 *
 *  ----------------------------------------------------------------------
 *  Copyright (c) 1994, 1995.  All rights reserved.
 *  
 *    The Harvest software was developed by the Internet Research Task
 *    Force Research Group on Resource Discovery (IRTF-RD):
 *  
 *          Mic Bowman of Transarc Corporation.
 *          Peter Danzig of the University of Southern California.
 *          Darren R. Hardy of the University of Colorado at Boulder.
 *          Udi Manber of the University of Arizona.
 *          Michael F. Schwartz of the University of Colorado at Boulder.
 *          Duane Wessels of the University of Colorado at Boulder.
 *  
 *    This copyright notice applies to software in the Harvest
 *    ``src/'' directory only.  Users should consult the individual
 *    copyright notices in the ``components/'' subdirectories for
 *    copyright information about other software bundled with the
 *    Harvest source code distribution.
 *  
 *  TERMS OF USE
 *    
 *    The Harvest software may be used and re-distributed without
 *    charge, provided that the software origin and research team are
 *    cited in any use of the system.  Most commonly this is
 *    accomplished by including a link to the Harvest Home Page
 *    (http://harvest.cs.colorado.edu/) from the query page of any
 *    Broker you deploy, as well as in the query result pages.  These
 *    links are generated automatically by the standard Broker
 *    software distribution.
 *    
 *    The Harvest software is provided ``as is'', without express or
 *    implied warranty, and with no support nor obligation to assist
 *    in its use, correction, modification or enhancement.  We assume
 *    no liability with respect to the infringement of copyrights,
 *    trade secrets, or any patents, and are not responsible for
 *    consequential damages.  Proper use of the Harvest software is
 *    entirely the responsibility of the user.
 *  
 *  DERIVATIVE WORKS
 *  
 *    Users may make derivative works from the Harvest software, subject 
 *    to the following constraints:
 *  
 *      - You must include the above copyright notice and these 
 *        accompanying paragraphs in all forms of derivative works, 
 *        and any documentation and other materials related to such 
 *        distribution and use acknowledge that the software was 
 *        developed at the above institutions.
 *  
 *      - You must notify IRTF-RD regarding your distribution of 
 *        the derivative work.
 *  
 *      - You must clearly notify users that your are distributing 
 *        a modified version and not the original Harvest software.
 *  
 *      - Any derivative product is also subject to these copyright 
 *        and use restrictions.
 *  
 *    Note that the Harvest software is NOT in the public domain.  We
 *    retain copyright, as specified above.
 *  
 *  HISTORY OF FREE SOFTWARE STATUS
 *  
 *    Originally we required sites to license the software in cases
 *    where they were going to build commercial products/services
 *    around Harvest.  In June 1995 we changed this policy.  We now
 *    allow people to use the core Harvest software (the code found in
 *    the Harvest ``src/'' directory) for free.  We made this change
 *    in the interest of encouraging the widest possible deployment of
 *    the technology.  The Harvest software is really a reference
 *    implementation of a set of protocols and formats, some of which
 *    we intend to standardize.  We encourage commercial
 *    re-implementations of code complying to this set of standards.  
 *  
 */
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <string.h>
#include <memory.h>
#include <errno.h>
#include <sys/param.h>
#include <sys/types.h>
#include <fcntl.h>
#include <dirent.h>
#include <time.h>
#include "util.h"
#include "essence.h"

/*
 *  File-local state shared by the unnesting routines.
 */

/* scratch directory for unnesting; filled in by init_presentation_unnest() */
static char unnestdir[MAXPATHLEN + 1];

/* output path chosen by the single-file extractors; "" when there is none */
static char extracted_filename[MAXPATHLEN + 1];

/* time recorded by mk_timestamp(); compared against st_ctime of new files */
static time_t timestamp;

/*
 *  Forward declarations of the file-local functions.
 */

/* extraction routines (referenced from the nested_types[] table) */
static int default_extract();
static int extract_Compressed();
static int extract_CompressedTar();
static int extract_GNUCompressed();
static int extract_GNUCompressedTar();
static int extract_MacBinHex();
static int extract_PCZipped();
static int extract_ShellArchive();
static int extract_Tar();
static int extract_Uuencoded();

/* recursion routines */
static DataObjectList *recurse_external();
static DataObjectList *recurse_unix_directory();

/* miscellaneous helpers */
static char *filename_to_url();
static int mk_timestamp();

/*
 *  README for adding new Presentation Unnesting routines:
 *
 *  nested_types[] maps a nested file type to the way it is unnested.
 *  To add unnesting for a new type, write a new extraction routine and
 *  add a (type, proc, do_timestamp) row to this table.
 *
 *  A row whose extract field is NULL is treated as an exploder:
 *  the program type.unnest is run with the URL as the first argument
 *  and the name of the file holding the data as the second argument.
 *  It must print a stream of SOIF templates that carry manual
 *  information about the data.  To add a new exploder, add the type
 *  to this table with extract set to NULL, then write a SOIF generator
 *  and install it as type.unnest.
 *
 *  The table ends with a row whose type is NULL; lookups stop there.
 */
struct nested_type {
	char *type;		/* name of the nested file type */
	int (*extract) ();	/* extraction routine, or NULL for an exploder */
	int do_timestamp;	/* non-zero: locate extracted files by ctime */
} nested_types[] = {
	/* normal unnesting that generates new objects to process */
	{ "Compressed", extract_Compressed, 0 },
	{ "CompressedTar", extract_CompressedTar, 1 },
	{ "GNUCompressed", extract_GNUCompressed, 0 },
	{ "GNUCompressedTar", extract_GNUCompressedTar, 1 },
	{ "ShellArchive", extract_ShellArchive, 1 },
	{ "Tar", extract_Tar, 1 },
	{ "Uuencoded", extract_Uuencoded, 1 },

	/* still buggy */
	{ "PCZipped", extract_PCZipped, 1 },

	/* exploders that generate a stream of SOIF tmpls to use as manual info */
	{ "Exploder", NULL, 0 },
	{ "IAFA", NULL, 0 },
	{ "LSM", NULL, 0 },
#ifdef USE_PCINDEX
	{ "PCIndex", NULL, 0 },
	{ "Cica-PCIndex", NULL, 0 },
	{ "Garbo-PCIndex", NULL, 0 },
	{ "Garbo-Win-PCIndex", NULL, 0 },
	{ "Hobbes-PCIndex", NULL, 0 },
	{ "Lowell-PCIndex", NULL, 0 },
	{ "Oakland-PCIndex", NULL, 0 },
	{ "Umich-PCIndex", NULL, 0 },
#endif

	/* currently unsupported */
	{ "MacBinHex", extract_MacBinHex, 1 },
	{ "PCCompressed", default_extract, 0 },

	/* end-of-table marker */
	{ NULL, default_extract, 0 }
};

/*
 *  presentation_unnest() - Unnests one level of presentation nesting
 *  from the given object and returns the resulting objects as a list.
 *
 *  Three cases are handled:
 *    - a file URL of type "Directory" is expanded through
 *      recurse_unix_directory();
 *    - a type whose table entry has no extract routine (an exploder) is
 *      retrieved and handed to the external "type.unnest" program
 *      through recurse_external();
 *    - any other type in nested_types[] is extracted inside a freshly
 *      created numbered subdirectory of unnestdir, and the extracted
 *      file(s) are wrapped in new DataObjects flagged
 *      F_TEMPORARY | F_NESTED whose parent_url is the object's own
 *      parent_url if it has one, otherwise the object's URL.
 *
 *  Returns the list of unnested objects, or NULL when the type is not
 *  unnestable, the object is flagged F_NO_ACCESS, a path or command
 *  does not fit its buffer, the URL or filename cannot be quoted safely
 *  for the shell, or any step of the extraction fails.  The list may
 *  also be NULL when the extraction produced no new files.
 */
DataObjectList *presentation_unnest(object)
     DataObject *object;
{
	static char unnestsubdir[MAXPATHLEN + 1];
	static int count = 0;	/* numbers the per-call subdirectories */
	DataObjectList *dol = NULL, *walker = NULL, *tdol = NULL;
	DIR *dirp;
	struct dirent *dp;
	struct stat sb;
	char buf[MAXPATHLEN + 1], *s;
	int cur, n;

	Debug(65, 1, ("Unnesting: %s\n", object->url->url));

	/* Check to see if we simply need to recurse to unnest */
	if (object->url->type == URL_FILE &&
	    !strcmp(object->type, "Directory")) {
		return (recurse_unix_directory(object));
	}
	/* Locate the object's type in the unnesting configuration table */
	for (cur = 0; nested_types[cur].type != NULL; cur++) {
		if (!strcmp(nested_types[cur].type, object->type)) {
			break;
		}
	}
	if (nested_types[cur].type == NULL) {
		errorlog("pn: %s not a unnestable type\n", object->type);
		return (NULL);
	}
	/* No extract routine: run the external exploder for this type */
	if (nested_types[cur].extract == NULL) {
		char cmd[BUFSIZ];
		char *fname;

		if (object_retrieve(object)) {
			return (NULL);
		}
		/*
		 *  Both strings are wrapped in single quotes for the shell.
		 *  A single quote inside either one would end the quoting
		 *  and let the rest be interpreted as shell syntax, so
		 *  refuse such names instead of running the command.
		 */
		fname = object->url->filename;
		if (fname == NULL || strchr(fname, '\'') != NULL ||
		    strchr(object->url->url, '\'') != NULL) {
			errorlog("presentation_unnest: Cannot safely quote %s for %s.unnest\n",
			    object->url->url, nested_types[cur].type);
			return (NULL);
		}
		n = snprintf(cmd, sizeof(cmd), "%s.unnest '%s' '%s'",
		    nested_types[cur].type, object->url->url, fname);
		if (n < 0 || (size_t) n >= sizeof(cmd)) {
			errorlog("presentation_unnest: Command too long for %s\n",
			    object->url->url);
			return (NULL);
		}
		return (recurse_external(object, cmd));
	}
	if (object->flags & F_NO_ACCESS)
		return (NULL);

	/* 
	 *  We check for the newly created files, two ways: 1) we generate
	 *  the new filenames based on the old filename (eg. foo.Z -> foo),
	 *  or we check stat(2) to see which files were created recently.
	 */
	if (nested_types[cur].do_timestamp && mk_timestamp())
		return (NULL);

	/* Every call gets its own numbered subdirectory of unnestdir */
	n = snprintf(unnestsubdir, sizeof(unnestsubdir), "%s/%05d",
	    unnestdir, ++count);
	if (n < 0 || (size_t) n >= sizeof(unnestsubdir)) {
		errorlog("presentation_unnest: Path too long: %s/%05d\n",
		    unnestdir, count);
		return (NULL);
	}
	if (mkdir(unnestsubdir, 0775) < 0) {
		log_errno2(__FILE__, __LINE__, unnestsubdir);
		return (NULL);
	}
	/* We'll need to create new file(s) in the unnestdir, so first cd */
	Debug(65, 1, ("presentation_unnest: chdir %s\n", unnestsubdir));
	if (chdir(unnestsubdir) < 0) {
		log_errno2(__FILE__, __LINE__, unnestsubdir);
		return (NULL);
	}
	/* 
	 *  Run the Extraction process, then gather the extracted 
	 *  files and return as DataObjectList 
	 */
	if ((*nested_types[cur].extract) (object)) {
		if (chdir(topdir) < 0) {	/* go back to previous directory */
			log_errno2(__FILE__, __LINE__, topdir);
		}
		return (NULL);	/* extraction failed */
	}
	/* Go back to previous directory */
	Debug(65, 1, ("presentation_unnest: chdir %s\n", topdir));
	if (chdir(topdir) < 0) {
		log_errno2(__FILE__, __LINE__, topdir);
	}
	/* We already know the extracted file */
	if (nested_types[cur].do_timestamp == 0) {
		s = filename_to_url(extracted_filename);
		dol = create_dol(s, object->flags | F_TEMPORARY | F_NESTED);
		xfree(s);
		if (dol == NULL)
			return (NULL);
		if (object->parent_url)
			dol->object->parent_url = strdup(object->parent_url);
		else
			dol->object->parent_url = strdup(object->url->url);
		return (dol);
	}
	/* 
	 *  We need to check the creation times to discover all of the
	 *  files generated.  
	 */
	if ((dirp = opendir(unnestsubdir)) == NULL) {
		log_errno2(__FILE__, __LINE__, unnestsubdir);
		errorlog("presentation_unnest: Cannot open directory: %s\n",
		    unnestsubdir);
		return (NULL);
	}
	while ((dp = readdir(dirp)) != NULL) {
		if (!strcmp(dp->d_name, ".") || !strcmp(dp->d_name, ".."))
			continue;
		n = snprintf(buf, sizeof(buf), "%s/%s", unnestsubdir,
		    dp->d_name);
		if (n < 0 || (size_t) n >= sizeof(buf)) {
			errorlog("presentation_unnest: Path too long: %s/%s\n",
			    unnestsubdir, dp->d_name);
			continue;
		}
		if (lstat(buf, &sb) < 0) {
			log_errno2(__FILE__, __LINE__, buf);
			errorlog("presentation_unnest: Cannot stat: %s\n", buf);
			continue;
		}
		if (timestamp > sb.st_ctime)	/* old file */
			continue;
		if (!strcmp(object->basename, dp->d_name))	/* same file */
			continue;

		s = filename_to_url(buf);
		tdol = create_dol(s, object->flags | F_TEMPORARY | F_NESTED);
		xfree(s);
		if (tdol == NULL)
			continue;
		if (object->parent_url)
			tdol->object->parent_url = strdup(object->parent_url);
		else
			tdol->object->parent_url = strdup(object->url->url);
		/* append to the tail of the result list */
		if (walker == NULL) {
			dol = tdol;
			walker = dol;
		} else {
			walker->next = tdol;
			walker = walker->next;
		}
	}
	closedir(dirp);
	return (dol);
}

/*
 *  is_nested_type() - Determines if the given type is has any presentation 
 *  nesting.  Returns a non-zero if true; zero otherwise.
 */
int is_nested_type(t)
     char *t;
{
	int i;

	if (t == NULL)
		return (0);
	if (!strcmp("Directory", t))
		return (1);
	for (i = 0; nested_types[i].type; i++)
		if (!strcmp(t, nested_types[i].type))
			return (1);
	return (0);
}


/*
 *  init_presentation_unnest() - Picks a scratch directory name with
 *  tempnam(tmpdir, "unnest"), stores it in unnestdir, creates the
 *  directory with mode 0755 and clears extracted_filename.  Calls
 *  fatal() when no name can be generated, when the name does not fit
 *  in unnestdir, or when the directory cannot be created.
 */
void init_presentation_unnest()
{
	char *s = tempnam(tmpdir, "unnest");

	if (s == NULL) {
		fatal("Cannot create tmp directory for unnest\n");
	}
	/* unnestdir is a fixed-size buffer; never copy more than it holds */
	if (strlen(s) >= sizeof(unnestdir)) {
		fatal("init_presentation_unnest: Path too long: %s\n", s);
	}
	strcpy(unnestdir, s);
	xfree(s);
	if (mkdir(unnestdir, 0755) < 0) {
		log_errno2(__FILE__, __LINE__, unnestdir);
		fatal("init_presentation_unnest: Cannot create %s\n",
		    unnestdir);
	}
	memset(extracted_filename, '\0', MAXPATHLEN + 1);
}

/*
 *  finish_presentation_unnest() - Removes the scratch directory
 *  unnestdir and everything below it by running "/bin/rm -rf", then
 *  unlinks the last single-file extraction result if one is recorded
 *  in extracted_filename (an unlink failure is logged).
 */
void finish_presentation_unnest()
{
	/*
	 *  Sized from unnestdir so the command always fits; BUFSIZ is
	 *  not guaranteed to be larger than a maximum-length path.
	 */
	char buf[sizeof(unnestdir) + 32];
	int n;

	n = snprintf(buf, sizeof(buf), "/bin/rm -rf '%s'", unnestdir);
	if (n >= 0 && (size_t) n < sizeof(buf)) {
		Debug(65, 1, ("finish_presentation_unnest: %s\n", buf));
		run_cmd(buf);
	}
	if (extracted_filename[0]) {
		if (unlink(extracted_filename) < 0)
			log_errno2(__FILE__, __LINE__, extracted_filename);
	}
}

/*
 *  mk_timestamp() - Stores the current time in the file-level
 *  `timestamp' variable and then sleeps for one second (presumably so
 *  that files created afterwards get a later ctime than the recorded
 *  second).  Always returns 0.
 */
static int mk_timestamp()
{
	(void) time(&timestamp);
	(void) sleep(1);
	return (0);
}

/*
 *  recurse_unix_directory() - Builds a list with one new DataObject per
 *  entry ("." and ".." excluded) of the directory object->url->filename.
 *  Each entry's URL is the directory's URL followed by "/" and the entry
 *  name; it is created with the directory's flags and receives a copy of
 *  the directory's parent_url when that is set.
 *
 *  Returns the list, or NULL when the directory cannot be opened or no
 *  object could be created.  An entry whose URL does not fit in the
 *  local buffer is skipped.
 */
static DataObjectList *recurse_unix_directory(object)
     DataObject *object;
{
	DataObjectList *dol = NULL, *walker = NULL, *tdol;
	struct dirent *dp;
	DIR *dirp;
	char buf[MAXPATHLEN + 1];
	int n;

	if ((dirp = opendir(object->url->filename)) == NULL)
		return (NULL);
	while ((dp = readdir(dirp)) != NULL) {
		if (!strcmp(dp->d_name, ".") || !strcmp(dp->d_name, ".."))
			continue;
		/* bounded formatting: never write past the end of buf */
		n = snprintf(buf, sizeof(buf), "%s/%s", object->url->url,
		    dp->d_name);
		if (n < 0 || (size_t) n >= sizeof(buf))
			continue;	/* URL would be truncated; skip entry */
		if ((tdol = create_dol(buf, object->flags)) == NULL)
			continue;
		if (object->parent_url)
			tdol->object->parent_url = strdup(object->parent_url);
		/* append to the tail of the result list */
		if (walker == NULL) {
			dol = tdol;
			walker = dol;
		} else {
			walker->next = tdol;
			walker = walker->next;
		}
	}
	closedir(dirp);
	return (dol);
}

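/*
 *  extraction routines
 *
 *  Each function extracts the given object and returns 0 on success,
 *  non-zero on error.  presentation_unnest() changes into a fresh
 *  subdirectory of the unnest directory before calling one of these,
 *  so archive members land in that current directory; the single-file
 *  decompressors instead write to an explicit path under tmpdir.
 */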
/*
 *  default_extract() - Placeholder for types that have no extraction
 *  routine: logs that extraction is unimplemented for the object's
 *  type, clears extracted_filename and returns 1 (failure).
 */
static int default_extract(object)
     DataObject *object;
{
	*extracted_filename = '\0';
	errorlog("Extraction for %s is UNIMPLEMENTED.\n", object->type);
	return (1);
}

/*
 *  extract_Compressed() - Uncompresses object->url->filename with
 *  CMD_UNCOMPRESS into tmpdir/<basename without its last suffix> and
 *  records that path in extracted_filename.
 *
 *  Returns the result of do_system() for the command.  Returns 1, with
 *  extracted_filename cleared, when the output path or the command does
 *  not fit its buffer, or when a name contains a character that is
 *  special inside shell double quotes.
 */
static int extract_Compressed(object)
     DataObject *object;
{
	char cmd[BUFSIZ], *src, *slash, *dot;
	int n;

	src = object->url->filename;
	n = snprintf(extracted_filename, sizeof(extracted_filename), "%s/%s",
	    tmpdir, object->basename);
	if (n < 0 || (size_t) n >= sizeof(extracted_filename)) {
		Debug(65, 1, ("extract_Compressed: output path too long for %s\n",
			object->url->url));
		extracted_filename[0] = '\0';
		return (1);
	}
	/*
	 *  Strip the suffix from the last path component only; a '.'
	 *  inside tmpdir must not truncate the directory part.
	 */
	slash = strrchr(extracted_filename, '/');
	dot = strrchr(slash != NULL ? slash : extracted_filename, '.');
	if (dot != NULL)
		*dot = '\0';
	/*
	 *  Both names are placed inside double quotes for the shell,
	 *  where '"', '$', '`' and '\\' keep their special meaning.
	 */
	if (src == NULL || strpbrk(src, "\"$`\\") != NULL ||
	    strpbrk(extracted_filename, "\"$`\\") != NULL) {
		Debug(65, 1, ("extract_Compressed: unsafe filename for %s\n",
			object->url->url));
		extracted_filename[0] = '\0';
		return (1);
	}
	n = snprintf(cmd, sizeof(cmd), "%s < \"%s\" > \"%s\"",
	    CMD_UNCOMPRESS, src, extracted_filename);
	if (n < 0 || (size_t) n >= sizeof(cmd)) {
		Debug(65, 1, ("extract_Compressed: command too long for %s\n",
			object->url->url));
		extracted_filename[0] = '\0';
		return (1);
	}
	Debug(65, 1, ("extract_Compressed: %s\n", cmd));
	return (do_system(cmd));
}

/*
 *  extract_CompressedTar() - Pipes object->url->filename through
 *  CMD_UNCOMPRESS into "CMD_TAR -xf -", unpacking into the current
 *  directory.
 *
 *  Returns the result of do_system() for the command, or 1 when the
 *  filename contains a character that is special inside shell double
 *  quotes or the command does not fit in the buffer.
 */
static int extract_CompressedTar(object)
     DataObject *object;
{
	char cmd[BUFSIZ];
	char *src = object->url->filename;
	int n;

	/* '"', '$', '`' and '\\' stay special inside double quotes */
	if (src == NULL || strpbrk(src, "\"$`\\") != NULL) {
		Debug(65, 1, ("extract_CompressedTar: unsafe filename for %s\n",
			object->url->url));
		return (1);
	}
	n = snprintf(cmd, sizeof(cmd), "%s < \"%s\" | %s -xf - ",
	    CMD_UNCOMPRESS, src, CMD_TAR);
	if (n < 0 || (size_t) n >= sizeof(cmd)) {
		Debug(65, 1, ("extract_CompressedTar: command too long for %s\n",
			object->url->url));
		return (1);
	}
	Debug(65, 1, ("extract_CompressedTar: %s\n", cmd));
	return (do_system(cmd));
}

/*
 *  extract_GNUCompressed() - Decompresses object->url->filename with
 *  "CMD_GZIP -dc" into tmpdir/<basename without its last suffix> and
 *  records that path in extracted_filename.
 *
 *  Returns the result of do_system() for the command.  Returns 1, with
 *  extracted_filename cleared, when the output path or the command does
 *  not fit its buffer, or when a name contains a character that is
 *  special inside shell double quotes.
 */
static int extract_GNUCompressed(object)
     DataObject *object;
{
	char cmd[BUFSIZ], *src, *slash, *dot;
	int n;

	src = object->url->filename;
	n = snprintf(extracted_filename, sizeof(extracted_filename), "%s/%s",
	    tmpdir, object->basename);
	if (n < 0 || (size_t) n >= sizeof(extracted_filename)) {
		Debug(65, 1, ("extract_GNUCompressed: output path too long for %s\n",
			object->url->url));
		extracted_filename[0] = '\0';
		return (1);
	}
	/*
	 *  Strip the suffix from the last path component only; a '.'
	 *  inside tmpdir must not truncate the directory part.
	 */
	slash = strrchr(extracted_filename, '/');
	dot = strrchr(slash != NULL ? slash : extracted_filename, '.');
	if (dot != NULL)
		*dot = '\0';
	/*
	 *  Both names are placed inside double quotes for the shell,
	 *  where '"', '$', '`' and '\\' keep their special meaning.
	 */
	if (src == NULL || strpbrk(src, "\"$`\\") != NULL ||
	    strpbrk(extracted_filename, "\"$`\\") != NULL) {
		Debug(65, 1, ("extract_GNUCompressed: unsafe filename for %s\n",
			object->url->url));
		extracted_filename[0] = '\0';
		return (1);
	}
	n = snprintf(cmd, sizeof(cmd), "%s -dc \"%s\" > \"%s\"", CMD_GZIP,
	    src, extracted_filename);
	if (n < 0 || (size_t) n >= sizeof(cmd)) {
		Debug(65, 1, ("extract_GNUCompressed: command too long for %s\n",
			object->url->url));
		extracted_filename[0] = '\0';
		return (1);
	}
	Debug(65, 1, ("extract_GNUCompressed: %s\n", cmd));
	return (do_system(cmd));
}

/*
 *  extract_GNUCompressedTar() - Pipes "CMD_GZIP -dc" output for
 *  object->url->filename into "CMD_TAR -xf -", unpacking into the
 *  current directory.
 *
 *  Returns the result of do_system() for the command, or 1 when the
 *  filename contains a character that is special inside shell double
 *  quotes or the command does not fit in the buffer.
 */
static int extract_GNUCompressedTar(object)
     DataObject *object;
{
	char cmd[BUFSIZ];
	char *src = object->url->filename;
	int n;

	/* '"', '$', '`' and '\\' stay special inside double quotes */
	if (src == NULL || strpbrk(src, "\"$`\\") != NULL) {
		Debug(65, 1, ("extract_GNUCompressedTar: unsafe filename for %s\n",
			object->url->url));
		return (1);
	}
	n = snprintf(cmd, sizeof(cmd), "%s -dc \"%s\" | %s -xf - ", CMD_GZIP,
	    src, CMD_TAR);
	if (n < 0 || (size_t) n >= sizeof(cmd)) {
		Debug(65, 1, ("extract_GNUCompressedTar: command too long for %s\n",
			object->url->url));
		return (1);
	}
	Debug(65, 1, ("extract_GNUCompressedTar: %s\n", cmd));
	return (do_system(cmd));
}

/*
 *  extract_Tar() - Unpacks object->url->filename into the current
 *  directory by feeding it to "CMD_TAR -xf -" on standard input.
 *
 *  Returns the result of do_system() for the command, or 1 when the
 *  filename contains a character that is special inside shell double
 *  quotes or the command does not fit in the buffer.
 */
static int extract_Tar(object)
     DataObject *object;
{
	char cmd[BUFSIZ];
	char *src = object->url->filename;
	int n;

	/* '"', '$', '`' and '\\' stay special inside double quotes */
	if (src == NULL || strpbrk(src, "\"$`\\") != NULL) {
		Debug(65, 1, ("extract_Tar: unsafe filename for %s\n",
			object->url->url));
		return (1);
	}
	n = snprintf(cmd, sizeof(cmd), "%s -xf - < \"%s\" ", CMD_TAR, src);
	if (n < 0 || (size_t) n >= sizeof(cmd)) {
		Debug(65, 1, ("extract_Tar: command too long for %s\n",
			object->url->url));
		return (1);
	}
	Debug(65, 1, ("extract_Tar: %s\n", cmd));
	return (do_system(cmd));
}

/*
 *  extract_Uuencoded() - Runs "uudecode" on object->url->filename from
 *  the current directory.
 *
 *  Returns the result of do_system() for the command, or 1 when the
 *  filename contains a character that is special inside shell double
 *  quotes or the command does not fit in the buffer.
 */
static int extract_Uuencoded(object)
     DataObject *object;
{
	char cmd[BUFSIZ];
	char *src = object->url->filename;
	int n;

	/* '"', '$', '`' and '\\' stay special inside double quotes */
	if (src == NULL || strpbrk(src, "\"$`\\") != NULL) {
		Debug(65, 1, ("extract_Uuencoded: unsafe filename for %s\n",
			object->url->url));
		return (1);
	}
	n = snprintf(cmd, sizeof(cmd), "uudecode \"%s\" ", src);
	if (n < 0 || (size_t) n >= sizeof(cmd)) {
		Debug(65, 1, ("extract_Uuencoded: command too long for %s\n",
			object->url->url));
		return (1);
	}
	Debug(65, 1, ("extract_Uuencoded: %s\n", cmd));
	return (do_system(cmd));
}

/*
 *  extract_ShellArchive() - Feeds object->url->filename to "unshar" on
 *  standard input, unpacking into the current directory.
 *
 *  Returns the result of do_system() for the command, or 1 when the
 *  filename contains a character that is special inside shell double
 *  quotes or the command does not fit in the buffer.
 */
static int extract_ShellArchive(object)
     DataObject *object;
{
	char cmd[BUFSIZ];
	char *src = object->url->filename;
	int n;

	/* '"', '$', '`' and '\\' stay special inside double quotes */
	if (src == NULL || strpbrk(src, "\"$`\\") != NULL) {
		Debug(65, 1, ("extract_ShellArchive: unsafe filename for %s\n",
			object->url->url));
		return (1);
	}
	n = snprintf(cmd, sizeof(cmd), "unshar < \"%s\" ", src);
	if (n < 0 || (size_t) n >= sizeof(cmd)) {
		Debug(65, 1, ("extract_ShellArchive: command too long for %s\n",
			object->url->url));
		return (1);
	}
	Debug(65, 1, ("extract_ShellArchive: %s\n", cmd));
	return (do_system(cmd));
}

/*
 *  extract_MacBinHex() - Feeds object->url->filename to "hexbin" on
 *  standard input from the current directory.
 *
 *  Returns the result of do_system() for the command, or 1 when the
 *  filename contains a character that is special inside shell double
 *  quotes or the command does not fit in the buffer.
 */
static int extract_MacBinHex(object)
     DataObject *object;
{
	char cmd[BUFSIZ];
	char *src = object->url->filename;
	int n;

	/* '"', '$', '`' and '\\' stay special inside double quotes */
	if (src == NULL || strpbrk(src, "\"$`\\") != NULL) {
		Debug(65, 1, ("extract_MacBinHex: unsafe filename for %s\n",
			object->url->url));
		return (1);
	}
	n = snprintf(cmd, sizeof(cmd), "hexbin < \"%s\" ", src);
	if (n < 0 || (size_t) n >= sizeof(cmd)) {
		Debug(65, 1, ("extract_MacBinHex: command too long for %s\n",
			object->url->url));
		return (1);
	}
	Debug(65, 1, ("extract_MacBinHex: %s\n", cmd));
	return (do_system(cmd));
}

/*
 *  extract_PCZipped() - Extracts a ZIP archive using UnZip version 5.0
 *  written by the Info-ZIP workgroup (David Kirschbaum, consolidator).
 *  Software was posted to comp.sources.unix.
 *
 *  Runs "CMD_UNZIP -qq -n -o -a" on object->url->filename from the
 *  current directory.  Returns the result of do_system() for the
 *  command, or 1 when the filename contains a character that is special
 *  inside shell double quotes or the command does not fit in the buffer.
 */
static int extract_PCZipped(object)
     DataObject *object;
{
	char cmd[BUFSIZ];
	char *src = object->url->filename;
	int n;

	/* '"', '$', '`' and '\\' stay special inside double quotes */
	if (src == NULL || strpbrk(src, "\"$`\\") != NULL) {
		Debug(65, 1, ("extract_PCZipped: unsafe filename for %s\n",
			object->url->url));
		return (1);
	}
	n = snprintf(cmd, sizeof(cmd), "%s -qq -n -o -a \"%s\"", CMD_UNZIP,
	    src);
	if (n < 0 || (size_t) n >= sizeof(cmd)) {
		Debug(65, 1, ("extract_PCZipped: command too long for %s\n",
			object->url->url));
		return (1);
	}
	Debug(65, 1, ("extract_PCZipped: %s\n", cmd));
	return (do_system(cmd));
}

/*
 *  filename_to_url() - Builds the URL "file://<host><fname>", where
 *  <host> is the string returned by getfullhostname(), in a buffer
 *  obtained from xmalloc().  The buffer is sized to fit the result
 *  exactly, so long host names or paths cannot overrun it.  The caller
 *  owns the returned string (callers in this file release it with
 *  xfree()).
 */
static char *filename_to_url(fname)
     char *fname;
{
	char *host, *p;

	host = getfullhostname();
	if (host == NULL)	/* defensive: format an empty host part */
		host = "";
	p = xmalloc(strlen("file://") + strlen(host) + strlen(fname) + 1);
	sprintf(p, "file://%s%s", host, fname);
	return (p);
}

/*
 *  unnest_dbcheck() - Asks the database-check routines about template t.
 *  If the template carries an MD5 attribute, returns the result of
 *  dbcheck_md5() for the template's URL and that value.  Otherwise, if
 *  it carries a Last-Modification-Time attribute, returns the result of
 *  dbcheck_timestamp() for the URL and that value converted with atoi().
 *  Returns 0 when the template has neither attribute.
 *
 *  recurse_external() skips a template when this returns non-zero, i.e.
 *  non-zero is taken to mean that the template has not changed.
 */
static int unnest_dbcheck(t)
     Template *t;
{
	AVPair *md5, *lmt;

	/* an MD5 attribute takes precedence over the modification time */
	md5 = extract_AVPair(t->list, T_MD5);
	if (md5 != NULL)
		return (dbcheck_md5(t->url, md5->value));

	lmt = extract_AVPair(t->list, T_LMT);
	if (lmt == NULL)
		return (0);
	return (dbcheck_timestamp(t->url, atoi(lmt->value)));
}

/*
 *  recurse_external() - Runs the external command cmd through popen()
 *  and reads the stream of SOIF templates it prints.  Each template
 *  that passes the database check is converted into a DataObject
 *  (created from the template's URL with F_NO_ACCESS | F_MANUAL) that
 *  takes over the template's attribute-value list, and is appended to
 *  the result list.
 *
 *  Returns the list of objects; NULL when popen() fails or when no
 *  object was produced.  The object parameter is only used for the
 *  debug message.
 */
static DataObjectList *recurse_external(object, cmd)
     DataObject *object;
     char *cmd;
{
	static DataObjectList *dol = NULL;	/* head of the result list */
	DataObjectList *walker, *tdol;	/* list tail; newly created node */
	Template *t;
	FILE *fp;

	Debug(65, 1, ("recurse_external: %s: %s\n", object->url->url, cmd));
	if ((fp = popen(cmd, "r")) == NULL) {
		log_errno2(__FILE__, __LINE__, cmd);
		return (NULL);
	}
	dol = walker = NULL;
	while (1) {
		/*
		 *  Since the dbcheck routines need the SOIF library,
		 *  we need to explicitly re-init the SOIF library
		 *  on each call.
		 */
		init_parse_template_file(fp);
		if ((t = parse_template()) == NULL) {
			/*
			 *  NOTE(review): neither branch below calls
			 *  finish_parse_template(); confirm the SOIF
			 *  library does not require it after a failed
			 *  parse.
			 */
			if (is_parse_end_of_input())
				break;	/* EOF */
			else
				continue;	/* Error, try again */
		}
		finish_parse_template();

		/*
		 *  If the user wants fake-md5s then compute an
		 *  MD5 based on the input SOIF template.
		 */
		if (do_fakemd5s && extract_AVPair(t->list, T_MD5) == NULL) {
			Buffer *b;
			char *newmd5;
			extern char *get_md5_string();

			/* print the template into a buffer and hash that text */
			b = init_print_template(NULL);
			print_template(t);
			newmd5 = get_md5_string(b->data, b->length);
			finish_print_template();
			FAST_add_AVList(t->list, T_MD5, newmd5, 32);
			xfree(newmd5);
		}
		/* non-zero from the database check: skip this template */
		if (unnest_dbcheck(t)) {
			free_template(t);
			continue;
		}
		tdol = create_dol(t->url, F_NO_ACCESS | F_MANUAL);
		if (tdol == NULL) {
			free_template(t);
			continue;
		}
		tdol->object->parent_url = NULL;
		tdol->object->type = NULL;
		tdol->object->ttype = strdup(t->template_type);
		/* the object now owns the template's attribute list */
		tdol->object->avl = t->list;
		/* append to the tail of the result list */
		if (walker == NULL) {
			walker = dol = tdol;
		} else {
			walker->next = tdol;
			walker = walker->next;
		}
		/* free template manually, since we want to keep the list */
		xfree(t->template_type);
		xfree(t->url);
		xfree(t);
	}
	/* the command's exit status is not examined */
	pclose(fp);
	return (dol);
}
