/* RCS-style revision identification string for this source file. */
static char rcsid[] = "post_process.c,v 1.19 1996/01/05 20:28:57 duane Exp";
/*
 *  post_process.c - Post Processing routines for the Essence system
 *
 *  Duane Wessels, wessels@cs.colorado.edu, May 1995
 *
 *  DEBUG: section  66, level 1         Gatherer essence post-summarizing
 *
 *  ----------------------------------------------------------------------
 *  Copyright (c) 1994, 1995.  All rights reserved.
 *  
 *    The Harvest software was developed by the Internet Research Task
 *    Force Research Group on Resource Discovery (IRTF-RD):
 *  
 *          Mic Bowman of Transarc Corporation.
 *          Peter Danzig of the University of Southern California.
 *          Darren R. Hardy of the University of Colorado at Boulder.
 *          Udi Manber of the University of Arizona.
 *          Michael F. Schwartz of the University of Colorado at Boulder.
 *          Duane Wessels of the University of Colorado at Boulder.
 *  
 *    This copyright notice applies to software in the Harvest
 *    ``src/'' directory only.  Users should consult the individual
 *    copyright notices in the ``components/'' subdirectories for
 *    copyright information about other software bundled with the
 *    Harvest source code distribution.
 *  
 *  TERMS OF USE
 *    
 *    The Harvest software may be used and re-distributed without
 *    charge, provided that the software origin and research team are
 *    cited in any use of the system.  Most commonly this is
 *    accomplished by including a link to the Harvest Home Page
 *    (http://harvest.cs.colorado.edu/) from the query page of any
 *    Broker you deploy, as well as in the query result pages.  These
 *    links are generated automatically by the standard Broker
 *    software distribution.
 *    
 *    The Harvest software is provided ``as is'', without express or
 *    implied warranty, and with no support nor obligation to assist
 *    in its use, correction, modification or enhancement.  We assume
 *    no liability with respect to the infringement of copyrights,
 *    trade secrets, or any patents, and are not responsible for
 *    consequential damages.  Proper use of the Harvest software is
 *    entirely the responsibility of the user.
 *  
 *  DERIVATIVE WORKS
 *  
 *    Users may make derivative works from the Harvest software, subject 
 *    to the following constraints:
 *  
 *      - You must include the above copyright notice and these 
 *        accompanying paragraphs in all forms of derivative works, 
 *        and any documentation and other materials related to such 
 *        distribution and use acknowledge that the software was 
 *        developed at the above institutions.
 *  
 *      - You must notify IRTF-RD regarding your distribution of 
 *        the derivative work.
 *  
 *      - You must clearly notify users that your are distributing 
 *        a modified version and not the original Harvest software.
 *  
 *      - Any derivative product is also subject to these copyright 
 *        and use restrictions.
 *  
 *    Note that the Harvest software is NOT in the public domain.  We
 *    retain copyright, as specified above.
 *  
 *  HISTORY OF FREE SOFTWARE STATUS
 *  
 *    Originally we required sites to license the software in cases
 *    where they were going to build commercial products/services
 *    around Harvest.  In June 1995 we changed this policy.  We now
 *    allow people to use the core Harvest software (the code found in
 *    the Harvest ``src/'' directory) for free.  We made this change
 *    in the interest of encouraging the widest possible deployment of
 *    the technology.  The Harvest software is really a reference
 *    implementation of a set of protocols and formats, some of which
 *    we intend to standardize.  We encourage commercial
 *    re-implementations of code complying to this set of standards.  
 *  
 */

#include <errno.h>
#include <fcntl.h>
#include <stdlib.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>

#include "util.h"
#include "template.h"
#include <GNUregex.h>

#include "post_process.h"

/*
 * Head of the linked list of post-processing rules that post_process()
 * walks through the ->next pointers.  Nothing in this file adds to the
 * list; presumably the yyparse() call made by pp_parse_rules() fills it
 * in -- TODO confirm against the grammar file.
 */
Rule *PPRules = NULL;


/*
 * my_write()
 *
 * A persistent write() for sockets and pipes.  Keeps calling write()
 * until all 'nbytes' bytes starting at 'ptr' have been written to 'fd',
 * or until write() reports an error.  A write() that is interrupted by
 * a signal (EINTR) is simply retried.
 *
 * Returns 'nbytes' once everything has been written.  If write()
 * returns zero or fails for any reason other than EINTR, that write()
 * result (0 or -1) is returned instead, even when part of the data had
 * already gone out.
 */
static int my_write(fd, ptr, nbytes)
     int fd, nbytes;
     char *ptr;
{
	int nleft = nbytes;
	int nwritten;

	while (nleft > 0) {
		/* hand the data over in chunks of at most 8192 bytes */
		nwritten = write(fd, ptr, nleft > 8192 ? 8192 : nleft);
		if (nwritten < 0 && errno == EINTR)
			continue;	/* interrupted before any data moved */
		if (nwritten <= 0) {
			return (nwritten);
		}
		nleft -= nwritten;
		ptr += nwritten;
	}
	return (nbytes - nleft);
}

/*
 * my_read()
 *
 * A persistent read() for sockets and pipes.  Keeps calling read()
 * until 'nbytes' bytes have been stored at 'ptr', end-of-file is
 * reached, or read() reports an error.  A read() that is interrupted
 * by a signal (EINTR) is simply retried.
 *
 * Returns the number of bytes actually read, which is smaller than
 * 'nbytes' only when end-of-file was hit first.  Returns the negative
 * read() result if read() fails for any reason other than EINTR.
 */
static int my_read(fd, ptr, nbytes)
     int fd, nbytes;
     char *ptr;
{
	int nleft = nbytes;
	int nread;

	while (nleft > 0) {
		nread = read(fd, ptr, nleft);
		if (nread < 0) {
			if (errno == EINTR)
				continue;	/* interrupted before any data moved */
			return (nread);
		}
		if (nread == 0)
			break;	/* end of file */
		nleft -= nread;
		ptr += nread;
	}
	return (nbytes - nleft);
}

/* 
 *    do_command_io (argv, writebuf, bytesout, bytesin)
 *
 *      Forks a child which executes the command in 'argv' with its
 *      stdin connected to a pipe and its stdout redirected to a
 *      temporary file.  The parent writes 'bytesout' bytes of
 *      'writebuf' to the pipe, waits for the child to finish, and
 *      then reads the temporary file back in.  The temporary file is
 *      removed before returning.
 *
 *      Returns an xmalloc'd buffer that contains the command output
 *      and sets *bytesin to the number of bytes in it; the caller
 *      must xfree() the buffer.  A NUL byte is stored just past the
 *      data (it is not counted in *bytesin) so that callers may treat
 *      the buffer as a C string.
 *
 *      Returns NULL, leaving *bytesin untouched, if any step fails or
 *      if the command wrote no output.  The exit status of the
 *      command is only reported through Debug(); it does not affect
 *      the return value.
 */
static char *do_command_io(argv, writebuf, bytesout, bytesin)
     char **argv;
     char *writebuf;
     int bytesout;
     int *bytesin;
{
	int n;
	char *tfile = NULL;
	int p[2];
	int fd = -1;		/* -1 until the parent has opened the temp file */
	int pid;
	int status = 0;
	struct stat sb;
	char *inbuf = NULL;

	Debug(66, 5, ("do_command_io: Running '%s'\n", *argv));

	/*
	 * NOTE(review): tempnam() only picks a name; the file is created
	 * later by the child, so another process can get there first.
	 * mkstemp() would close that window -- consider switching.
	 */
	if ((tfile = tempnam(0, 0)) == (char *) 0)
		goto do_cmd_done;

	if (pipe(p) < 0) {
		log_errno2(__FILE__, __LINE__, "pipe");
		goto do_cmd_done;
	}
	pid = fork();
	if (pid < 0) {
		/* no child: never write to the pipe or wait on pid -1 */
		log_errno2(__FILE__, __LINE__, "fork");
		close(p[0]);
		close(p[1]);
		goto do_cmd_done;
	}
	if (pid == 0) {		/* child */
		fd = open(tfile, O_WRONLY | O_TRUNC | O_CREAT, 0660);
		if (fd < 0) {
			log_errno2(__FILE__, __LINE__, tfile);
			_exit(1);
		}
		/* stdout -> temp file, stdin <- read end of the pipe */
		if (dup2(fd, 1) < 0 || dup2(p[0], 0) < 0) {
			log_errno2(__FILE__, __LINE__, "dup2");
			_exit(1);
		}
		close(fd);
		close(p[0]);
		close(p[1]);
		execvp(*argv, argv);
		/* only reached when the exec failed */
		log_errno2(__FILE__, __LINE__, *argv);
		_exit(1);
	}
	/* parent */
	close(p[0]);
	n = my_write(p[1], writebuf, bytesout);
	if (n != bytesout) {
		Debug(66, 1, ("do_command_io: short write to '%s' (%d of %d bytes)\n",
			*argv, n, bytesout));
	}
	/* closing the write end is what gives the command its EOF */
	close(p[1]);
	while (waitpid(pid, &status, 0) < 0 && errno == EINTR)
		continue;	/* interrupted by a signal: wait again */
	Debug(66, 5, ("do_command_io: '%s' returned %d\n", *argv, status >> 8));
	if (stat(tfile, &sb) < 0) {
		log_errno2(__FILE__, __LINE__, tfile);
		goto do_cmd_done;
	}
	if (sb.st_size <= 0) {
		Debug(66, 1, ("do_command_io: '%s' wrote no data\n", *argv));
		goto do_cmd_done;
	}
	fd = open(tfile, O_RDONLY);
	if (fd < 0) {
		log_errno2(__FILE__, __LINE__, tfile);
		goto do_cmd_done;
	}
	/* one extra byte for the terminating NUL */
	inbuf = (char *) xmalloc((size_t) sb.st_size + 1);
	/* my_read() has no prototype, so convert the off_t explicitly */
	n = my_read(fd, inbuf, (int) sb.st_size);
	if (n < 0) {
		log_errno2(__FILE__, __LINE__, "read");
		xfree(inbuf);
		inbuf = 0;
		goto do_cmd_done;
	}
	inbuf[n] = '\0';
	*bytesin = n;

      do_cmd_done:
	if (fd >= 0)
		close(fd);
	if (tfile)
		unlink(tfile);
	xfree(tfile);
	return inbuf;
}


/*
 * check_condition()
 *
 * Check a single condition from the rules against template T.  Return 1
 * if the condition holds, or 0 if it fails.
 *
 * The attribute named by the condition is looked up in the template
 * (the name "url", in any letter case, selects T->url instead of an
 * A/V pair) and its value is compared with the condition value.
 * Supported operators are case-insensitive string equality/inequality
 * (EQUALS, NOTEQ) and extended regular expression match/non-match
 * (REGEX, NOTRE).
 *
 * The condition also fails (returns 0) when it is incomplete, when the
 * attribute is not present in the template or has no value, when the
 * operator is unknown, or when the regular expression does not compile.
 */
static int check_condition(c, T)
     Cond *c;
     Template *T;
{
	char *attr = NULL;
	char *c_val = NULL;
	char *t_val = NULL;
	AVPair *pair = NULL;
	int ret = 0;
	int matched;
	regex_t compiled_pattern;

	if (!c || !c->attr || !c->value)
		goto finish_check_cond;
	if (!c->attr->word || !c->value->word)
		goto finish_check_cond;

	/* the strings are only read below, so no private copies are made */
	attr = c->attr->word;
	c_val = c->value->word;

	if (!strcasecmp(attr, "url"))
		t_val = T->url;
	else {
		pair = extract_AVPair(T->list, attr);
		if (pair == NULL)
			goto finish_check_cond;
		t_val = pair->value;
	}
	if (t_val == NULL)
		goto finish_check_cond;

	Debug(66, 5, ("check_condition: attr=%s\n", attr));
	Debug(66, 5, ("check_condition: c_val=%s\n", c_val));
	Debug(66, 5, ("check_condition: t_val=%s\n", t_val));

	switch (c->op) {
	case EQUALS:
		ret = (strcasecmp(t_val, c_val) == 0);
		break;
	case NOTEQ:
		ret = (strcasecmp(t_val, c_val) != 0);
		break;
	case REGEX:
	case NOTRE:
		/* never run regexec()/regfree() on a pattern that failed to compile */
		if (regcomp(&compiled_pattern, c_val, REG_EXTENDED) != 0) {
			Log("check_condition: cannot compile regular expression '%s'\n", c_val);
			break;
		}
		matched = (regexec(&compiled_pattern, t_val, 0, 0, 0) == 0);
		regfree(&compiled_pattern);
		ret = (c->op == REGEX) ? matched : !matched;
		break;
	default:
		ret = 0;
		break;
	}


      finish_check_cond:
	Debug(66, 1, ("check_condition: returning %d\n", ret));
	return ret;
}


/*
 * check_conditions()
 *
 * Check a group of conditions from the rules.  Return 1 if the conditions
 * hold, or 0 if they do not.  Conditions can be joined with AND, OR.
 * Individual conditions are evaluated in left->right order and folded
 * into a running result using the operator stored in the previous
 * condition's 'nextop'.  Complex AND/OR groupings are not possible.
 *
 * Every condition is evaluated, even when the running result could no
 * longer change.  An empty list yields 0, matching what
 * check_condition() returns for a missing condition.  An unrecognized
 * joining operator is reported on stderr and leaves the running result
 * unchanged.
 */
static int check_conditions(C, T)
     Cond *C;
     Template *T;
{
	Cond *c = NULL;
	int this_val;
	int running_val = 0;	/* result for an empty condition list */
	int lastop = -1;	/* -1: no joining operator pending */

	for (c = C; c; c = c->next) {
		this_val = check_condition(c, T);
		if (lastop == -1) {
			running_val = this_val;
		} else {
			switch (lastop) {
			case AND:
				running_val = running_val && this_val;
				break;
			case OR:
				running_val = running_val || this_val;
				break;
			default:
				fprintf(stderr, "Unknown condition op: %d\n", lastop);
				break;
			}
		}
		lastop = c->nextop;
	}
	return running_val;
}

/* 
 * do_assign_inst (T, attrs, args)
 *
 * attrs->word names the attribute to set.
 * args->word is the value to give it.
 * 
 * Hands the name/value pair to add_AVList() for the template's list,
 * using the string length of the value as its size.
 *
 * Returns 1 after the call to add_AVList(), or 0 without touching the
 * template when either word is missing.
 */
static int do_assign_inst(T, attrs, args)
     Template *T;
     Word *attrs;
     Word *args;
{
	char *name;
	char *value;

	if (attrs == NULL || attrs->word == NULL)
		return 0;
	if (args == NULL || args->word == NULL)
		return 0;

	name = attrs->word;
	value = args->word;

	Debug(66, 5, ("do_assign_inst: %s = %s\n", name, value));

	add_AVList(T->list, name, value, strlen(value));
	return 1;
}

/* 
 * do_pipe_inst (T, attrs, args)
 *
 * attrs->word is an attribute name
 * args is a list of words that make up a command.
 * 
 * Runs the command through do_command_io(), writing the attribute value
 * to it.  The command output replaces the attribute value in the
 * Template.
 *
 * The name "url" (in any letter case) is special: the command is fed
 * T->url followed by a newline, and T->url is replaced by the output
 * up to, but not including, the first newline.
 *
 * Returns 1 if the value was replaced, or 0 if the arguments are
 * incomplete, the attribute does not exist, or the command produced
 * no output.
 */
static int do_pipe_inst(T, attrs, args)
     Template *T;
     Word *attrs;
     Word *args;
{
	AVPair *pr = NULL;
	int is_url;
	int argc;
	int i, n = 0;
	int len;
	char **argv = NULL;
	Word *w = NULL;
	int ret = 0;
	char *outbuf = NULL;	/* data written to the command */
	int outlen = 0;
	char *urlbuf = NULL;	/* our own "URL\n" copy; url case only */
	char *inbuf = NULL;
	char *newurl = NULL;

	if (!attrs || !attrs->word)
		return 0;
	if (!args || !args->word)
		return 0;

	Debug(66, 5, ("do_pipe_inst: %s | %s ...\n", attrs->word, args->word));

	is_url = (strcasecmp(attrs->word, "url") == 0);
	if (is_url) {
		if (!T->url) {
			Debug(66, 5, ("Attribute '%s' not found.\n", attrs->word));
			return 0;
		}
		/* send the URL and a newline; the terminating NUL is not sent */
		outlen = (int) strlen(T->url) + 1;
		urlbuf = (char *) xmalloc(outlen + 1);
		sprintf(urlbuf, "%s\n", T->url);
		outbuf = urlbuf;
	} else {
		pr = extract_AVPair(T->list, attrs->word);
		if (!pr) {
			Debug(66, 5, ("Attribute '%s' not found.\n", attrs->word));
			return 0;
		}
		outbuf = pr->value;
		outlen = pr->vsize;
	}

	/* turn the word list into a NULL-terminated argv[] */
	for (argc = 0, w = args; w; w = w->next)
		argc++;
	argv = (char **) xmalloc((argc + 1) * sizeof(char *));
	for (i = 0, w = args; w; i++, w = w->next)
		*(argv + i) = xstrdup(w->word);
	*(argv + argc) = NULL;

	inbuf = do_command_io(argv, outbuf, outlen, &n);
	if (inbuf == (char *) NULL)
		goto do_pipe_done;

	ret = 1;
	if (is_url) {
		/*
		 * The new URL is the output up to the first newline or NUL.
		 * The scan is bounded by 'n' so it does not rely on the
		 * buffer being NUL-terminated.
		 */
		for (len = 0; len < n; len++)
			if (inbuf[len] == '\n' || inbuf[len] == '\0')
				break;
		newurl = (char *) xmalloc(len + 1);
		for (i = 0; i < len; i++)
			newurl[i] = inbuf[i];
		newurl[len] = '\0';
		xfree(T->url);
		T->url = newurl;
	} else {
		add_AVList(T->list, attrs->word, inbuf, n);
	}

      do_pipe_done:
	for (i = 0; i < argc; i++)
		xfree(*(argv + i));
	xfree(argv);
	xfree(inbuf);
	xfree(urlbuf);
	return ret;
}

/* 
 * do_bang_inst (T, attrs, args)
 *
 * attrs is a list of attribute names.
 * args is a list of words that make up a command.
 * 
 * Open a pipe to the command and write the SOIF A/V pairs for the
 * given attributes.  The output of the command is also SOIF A/V pairs
 * which is incoprorated into the template.  Existing attributes will
 * be overwritten.
 */
static int do_bang_inst(T, attrs, args)
     Template *T;
     Word *attrs;
     Word *args;
{
	AVPair *pr = NULL;
	int argc;
	int i, n;
	char **argv = NULL;
	Word *w = NULL;
	int ret = NULL;
	char *inbuf = NULL;
	Buffer *outb = NULL;
	Template *N = NULL;

	if (!attrs)
		return 0;
	if (!attrs->word)
		return 0;
	if (!args)
		return 0;
	if (!args->word)
		return 0;

	Debug(66, 5, ("do_bang_inst: %s ... ! %s ...\n", attrs->word, args->word));

	N = create_template(0, T->url);

	for (w = attrs; w; w = w->next) {
		pr = extract_AVPair(T->list, w->word);
		if (!pr)
			continue;
		if (!N->list) {
			N->list = create_AVList(pr->attribute, pr->value, pr->vsize);
		} else {
			add_AVList(N->list, pr->attribute, pr->value, pr->vsize);
		}
	}

	outb = init_print_template(0);
	print_template(N);
	free_template(N);

	for (argc = 0, w = args; w; w = w->next)
		argc++;
	argv = (char **) xmalloc((argc + 1) * sizeof(char *));
	for (i = 0, w = args; w; i++, w = w->next)
		*(argv + i) = xstrdup(w->word);
	*(argv + argc) = NULL;

	Debug(66, 1, ("Writing this data (%d bytes) to %s:%s\n",
		outb->length, *argv, outb->data));

	inbuf = do_command_io(argv, outb->data, outb->length, &n);
	finish_print_template();
	if (!inbuf)
		goto do_bang_done;

	init_parse_template_string(inbuf, n);
	N = parse_template();
	finish_parse_template();
	if (!N)
		goto do_bang_done;

	merge_AVList(T->list, N->list);
	free_template(N);

	ret = 1;

      do_bang_done:
	for (i = 0; i < argc; i++)
		xfree(*(argv + i));
	xfree(argv);
	xfree(inbuf);
	return ret;
}


/*
 * do_instructions()
 *
 * Run the instructions from a rule, in list order, against template T.
 *
 * Returns SUMMARIZE_DONT_ADD_OBJECT if any instruction in the list was
 * a DELETE.  Otherwise returns the result of the last instruction run
 * (0 for an unknown instruction type or an empty list).  The remaining
 * instructions are still executed after a DELETE; they just can no
 * longer overwrite the delete result.
 */
static int do_instructions(I, T)
     Inst *I;
     Template *T;
{
	Inst *i = NULL;
	int ret = 0;
	int deleted = 0;	/* set once a DELETE has been seen */

	Debug(66, 1, ("do_instructions: %s\n", T->url));
	for (i = I; i; i = i->next) {
		Debug(66, 1, ("Doing instruction type %d\n", i->op));
		switch (i->op) {
		case ASSIGN:
			ret = do_assign_inst(T, i->attrs, i->args);
			break;
		case PIPE:
			ret = do_pipe_inst(T, i->attrs, i->args);
			break;
		case BANG:
			ret = do_bang_inst(T, i->attrs, i->args);
			break;
		case DELETE:
			deleted = 1;
			ret = SUMMARIZE_DONT_ADD_OBJECT;
			break;
		default:
			ret = 0;
			break;
		}
	}
	if (deleted)
		return SUMMARIZE_DONT_ADD_OBJECT;
	return ret;
}

/*
 * post_process() - Post Process a SOIF template.
 *
 * Walks the PPRules list in order.  For every rule whose conditions
 * hold for T, the rule's instructions are run on T.
 *
 * Returns the do_instructions() result of the last rule that matched,
 * or 0 if no rule matched.
 *
 * NOTE(review): because each matching rule overwrites the result, a
 * SUMMARIZE_DONT_ADD_OBJECT produced by an earlier rule is lost when a
 * later rule also matches -- confirm that this is intended.
 */
int post_process(T)
     Template *T;
{
	Rule *rule = PPRules;
	int result = 0;

	Debug(66, 1, ("post_process:  Starting: %s\n", T->url));
	while (rule != NULL) {
		if (check_conditions(rule->cond, T)) {
			Debug(66, 2, ("post_process: Munging: %s\n", T->url));
			result = do_instructions(rule->inst, T);
		}
		rule = rule->next;
	}
	Debug(66, 2, ("post_process: Returning: %d\n", result));
	return result;
}


/*
 * Parser entry point and parser input stream, both defined outside
 * this file.  The names follow the yacc/lex convention, so presumably
 * they come from a generated rule-file parser -- confirm against the
 * build.  pp_parse_rules() points yyin at the rules file and then
 * calls yyparse().
 */
extern int yyparse();
extern FILE *yyin;

/*
 * pp_parse_rules ()
 *
 * Opens the post-processing rules file 'filename', points the parser
 * input stream yyin at it and runs yyparse() over it.  The file is
 * closed and yyin is reset to NULL before returning.
 *
 * Returns 1 if the file cannot be opened; otherwise returns whatever
 * yyparse() returned.
 */
int pp_parse_rules(filename)
     char *filename;
{
	FILE *rules;
	int status;

	Log("reading post-processing rules from %s\n", filename);

	if ((rules = fopen(filename, "r")) == NULL) {
		log_errno2(__FILE__, __LINE__, filename);
		return 1;
	}

	yyin = rules;
	status = yyparse();
	fclose(rules);
	yyin = (FILE *) NULL;

	Debug(66, 1, ("returning %d from yyparse\n", status));
	return status;
}
