diff options
Diffstat (limited to 'maildrop/re.C')
| -rw-r--r-- | maildrop/re.C | 888 | 
1 files changed, 888 insertions, 0 deletions
| diff --git a/maildrop/re.C b/maildrop/re.C new file mode 100644 index 0000000..f2ab94e --- /dev/null +++ b/maildrop/re.C @@ -0,0 +1,888 @@ +#include	"config.h" +#include	"re.h" +#include	"mio.h" +#include	"regexpnode.h" +#include	"rematch.h" +#include	"funcs.h" +#include	"buffer.h" +#include	<ctype.h> + + +////////////////////////////////////////////////////////////////////////////// +// +// Create sets for the [:is....:] codes. +// + +static void mk_alnum(unsigned char *p) +{ +register unsigned i; + +	for (i=0; i<256; i++) +		if (isalnum(i)) +			p[i/8] |= 1 << (i % 8); +} + +static void mk_alpha(unsigned char *p) +{ +register unsigned i; + +	for (i=0; i<256; i++) +		if (isalpha(i)) +			p[i/8] |= 1 << (i % 8); +} + +static void mk_cntrl(unsigned char *p) +{ +register unsigned i; + +	for (i=0; i<256; i++) +		if (iscntrl(i)) +			p[i/8] |= 1 << (i % 8); +} + +static void mk_digit(unsigned char *p) +{ +register unsigned i; + +	for (i=0; i<256; i++) +		if (isdigit(i)) +			p[i/8] |= 1 << (i % 8); +} + +static void mk_graph(unsigned char *p) +{ +register unsigned i; + +	for (i=0; i<256; i++) +		if (isgraph(i)) +			p[i/8] |= 1 << (i % 8); +} + +static void mk_lower(unsigned char *p) +{ +register unsigned i; + +	for (i=0; i<256; i++) +		if (islower(i)) +			p[i/8] |= 1 << (i % 8); +} + +static void mk_print(unsigned char *p) +{ +register unsigned i; + +	for (i=0; i<256; i++) +		if (isprint(i)) +			p[i/8] |= 1 << (i % 8); +} + +static void mk_punct(unsigned char *p) +{ +register unsigned i; + +	for (i=0; i<256; i++) +		if (ispunct(i)) +			p[i/8] |= 1 << (i % 8); +} + +static void mk_space(unsigned char *p) +{ +register unsigned i; + +	for (i=0; i<256; i++) +		if (isspace(i)) +			p[i/8] |= 1 << (i % 8); +} + +static void mk_upper(unsigned char *p) +{ +register unsigned i; + +	for (i=0; i<256; i++) +		if (isupper(i)) +			p[i/8] |= 1 << (i % 8); +} + +static void mk_xdigit(unsigned char *p) +{ +register unsigned i; + +	for (i=0; i<256; i++) +		if (isxdigit(i)) +			p[i/8] |= 1 << (i % 8); +} + +static void mk_wbreak(unsigned char *p) +{ +register unsigned i; + +	for (i=0; i<256; i++) +		if (!isalnum(i) && i != '_') +			p[i/8] |= 1 << (i % 8); +} + +static const char *const is_setname[]={ +	":alnum:", +	":alpha:", +	":cntrl:", +	":digit:", +	":graph:", +	":lower:", +	":print:", +	":punct:", +	":space:", +	":upper:", +	":xdigit:", +	":wbreak:"}; + +static void (*is_setfunc[])(unsigned char *)={ +	mk_alnum, +	mk_alpha, +	mk_cntrl, +	mk_digit, +	mk_graph, +	mk_lower, +	mk_print, +	mk_punct, +	mk_space, +	mk_upper, +	mk_xdigit, +	mk_wbreak}; + +Re::Re() : chainedre(0), prevre(0), nodes(0), first(0), isCaret(0) +{ +} + +Re::~Re() +{ +	init(); +} + +void Re::init() +{ +	if (chainedre)	delete chainedre; +	chainedre=0; + +RegExpNode *n; + +	while ((n=nodes) != 0) +	{ +		nodes=n->next; +		delete n; +	} +} + +inline RegExpNode	*Re::allocnode() +{ +RegExpNode	*n; + +	if ((n=new RegExpNode(nextid++)) == 0) +		outofmem(); +	n->next=nodes; nodes=n; return(n); +} + +int Re::Compile(const char *ptr, int caseflag, int &errindex) +{ +	if (*ptr == '^') +	{ +		if (CompileS(ptr+1, caseflag, errindex))	return (-1); +		isCaret=1; +		return (0); +	} + +	if (CompileS("[.\n]*", 1, errindex) < 0)	return (-1); +	if ((chainedre=new Re) == 0) +		outofmem(); +	isDummy=1; +	chainedre->prevre=this; +	return (chainedre->CompileS(ptr, caseflag, errindex)); +} + +int Re::CompileS(const char *ptr, int caseflag, int &errindex) +{ +	expr=ptr; +	origexpr=expr; +	init(); +	nextid=0; +	first=0; +	isCaret=0; +	isDummy=0; +	casesensitive=caseflag; +	matchFull=0; + +int	rc=0; + +	try +	{ +	RegExpNode **p=CompileOrClause(&first); + +		if (*expr == '!') +		{ +		int dummy; + +			++expr; +			if ((chainedre=new Re) == 0) +				outofmem(); +			if ( chainedre->CompileS(expr, caseflag, dummy) < 0) +			{ +				expr += dummy; +				throw -1; +			} +			chainedre->prevre=this; +			if (VerboseLevel() > 7) +				merr.write("\n*** CHAINED TO ***\n"); + +		} else if (curchar())	throw -1; + +		final=*p=allocnode(); +		final->thechar=REFINAL; +	} +	catch (int n) +	{ +		init(); +		errindex=expr-origexpr; +		rc= n; +	} +	if (rc == 0 && VerboseLevel() > 7) +	{ +	RegExpNode *n; +	Buffer	b; + +		if (first) +		{ +			b="Start node: "; +			b.append( (unsigned long)first->id ); +			b += "\n\n"; +			b += '\0'; +			merr.write(b); +		} +		for (n=nodes; n; n=n->next) +		{ +			b="Node "; +			b.append( (unsigned long)n->id ); +			b += ": "; +			switch (n->thechar)	{ +			case RENULL: +				b  += "null"; +				break; +			case RESET: +				b += "[set] "; +				{ +				int i,j=0; + +					for (i=0; i<256; i=j) +					{ +						j=i+1; +						if ((n->reset[i/8] & +							(1 << (i % 8))) == 0) +							continue; +						for (j=i; j<256; j++) +							if ((n->reset[j/8] & +								(1 << (j % 8))) +								== 0) +							break; +						if (i < ' ' || i > 127) +						{ +							b += '#'; +							b.append((unsigned long) +								i); +						} +						else +						{ +							if (i == '#' +								|| i == '-' +								|| i == '\\') +								b += '\\'; +							b += (char)i; +						} +						if (i+1 == j)	continue; +						b += ('-'); +						--j; +					} +				} +				break; +			case REFINAL: +				b += "final"; +				break; +			default: +				if (n->thechar >= ' ' && n->thechar < 127) +				{ +					b += '\''; +					b += (char)n->thechar; +					b += '\''; +				} +				else +				{ +					b += "chr("; +					b.append((unsigned long)n->thechar); +					b += ')'; +				} +			} +			b += '\n'; +			b += '\0'; +			merr.write( b ); +			if (n->next1) +			{ +				b="    transition to "; +				b.append((unsigned long)n->next1->id); +				b += '\n'; +				b += '\0'; +				merr.write(b); +			} + +			if (n->next2) +			{ +				b="    transition to "; +				b.append((unsigned long)n->next2->id); +				b += '\n'; +				b += '\0'; +				merr.write(b); +			} +			merr.write("\n"); +		} +	} +	return (rc); +} + +RegExpNode **Re::CompileOrClause(RegExpNode **ptr) +{ +RegExpNode **finish=CompileAtomString(ptr); + +	if ( curchar() != '|')	return (finish); + +RegExpNode *realfinish=allocnode(); + +	realfinish->thechar=RENULL; +	*finish=realfinish; + +	while ( curchar() == '|' ) +	{ +		nextchar(); + +	RegExpNode *newstart=allocnode(); + +		newstart->thechar=RENULL; +		newstart->next1= *ptr; +		*ptr=newstart; + +		finish=CompileAtomString(&newstart->next2); +		*finish=realfinish; +	} +	return (&realfinish->next1); +} + +RegExpNode **Re::CompileAtomString(RegExpNode **ptr) +{ +int	c; + +	for (;;) +	{ +		c=curchar(); +		if (c == 0 || c == '|' || c == ')' || c == '!') +			break; +		ptr=CompileElement(ptr); +	} +	return (ptr); +} + +RegExpNode **Re::CompileElement(RegExpNode **start) +{ +RegExpNode **finish; + +	if (curchar() != '$') +	{ +		finish=CompileAtom(start); +	} +	else +	{ +		nextchar(); +		if (curchar() == 0) +		{ +			matchFull=1; +			return (start); +		} +		(*start)=allocnode(); +		(*start)->thechar='$'; +		finish= & (*start)->next1; +	} + +	switch (curchar())	{ +	case '+': +		(*finish)=allocnode(); +		(*finish)->thechar=RENULL; +		(*finish)->next1=(*start); +		finish= &(*finish)->next2; +		nextchar(); +		break; +	case '*': +		(*finish)=allocnode(); +		(*finish)->thechar=RENULL; +		(*finish)->next1=(*start); +		(*start)=(*finish); +		finish= &(*finish)->next2; +		nextchar(); +		break; +	case '?': + +		{ +		RegExpNode *newstart=allocnode(); + +			newstart->thechar=RENULL; +			(*finish)=allocnode(); +			(*finish)->thechar=RENULL; +			newstart->next1= *start; +			newstart->next2= *finish; +			*start=newstart; +			finish= &(*finish)->next1; +			nextchar(); +		} +		break; +	} +	return (finish); +} + +RegExpNode **Re::CompileAtom(RegExpNode **ptr) +{ +int	c=curchar(); + +	if (c == '(')	// Subexpression +	{ +		nextchar(); +		ptr=CompileOrClause(ptr); +		if ( curchar() != ')')	throw -1; +		nextchar(); +		return (ptr); +	} + +	(*ptr)=allocnode(); + +	if (c == '[' || c == '.') +	{ +	int	i, complement=0; + +		if ( ((*ptr)->reset=new unsigned char[256/8]) == 0) +			outofmem(); +		for (i=0; i<256/8; i++) +			(*ptr)->reset[i]=0; + +		if ( c == '.' ) +		{ +			(*ptr)->reset[ '\n' / 8 ] |= 1 << ('\n' % 8); +			complement=1; +		} +		else +		{ +			nextchar(); +			if ( curchar() == '^') +			{ +				complement=1; +				nextchar(); +			} + +			is_sets(*ptr); +		} +		nextchar(); +		if (complement) +			for (i=0; i<256/8; i++) +				(*ptr)->reset[i] ^= ~0; +		c=RESET; +	} +	else c=parsechar(); + +	(*ptr)->thechar=c; +	return (&(*ptr)->next1); +} + +void Re::is_sets(RegExpNode *p) +{ +Buffer	buf; +int	c=curchar(); +int	call_parsechar=1; + +	if (c == ':') +	{ +		do +		{ +			buf += c; +			nextchar(); +		} while ( (c=curchar()) >= 0 && isalpha(c)); + +		if (c == ':') +		{ +			buf += c; +			nextchar(); +			c=curchar(); +			if (c == ']') +			{ +				buf += '\0'; + +			const char *q=(const char *)buf; +			unsigned i; + +				for (i=0; i<sizeof(is_setname)/ +					sizeof(is_setname[0]); i++) +				{ +					if (strcmp(is_setname[i], q) == 0) +					{ +						(*is_setfunc[i])(p->reset); +						return; +					} +				} +			} +		} + +	int	i=0; + +		for (i=0; i<buf.Length(); i++) +		{ +			c=(int)(unsigned char)((const char*)buf)[i]; +			p->reset[ c / 8 ] |= 1 << (c % 8); +		} +		// In case the next character is '-', leave 'c' the way it +		// is. +		call_parsechar=0; +		if (curchar() == ']')	return; +	} + +	do +	{ +	int	c2; + +		if (c == 0)	throw -1; + +		if (c == '.') +		{ +			for (c2=0; c2 < 256/8; c2++) +				if (c2 != '\n' / 8) +					p->reset[c2]= ~0; +				else +					p->reset[c2] |= ~(1 << ('\n' % 8)); +			if (call_parsechar) +				nextchar(); +			call_parsechar=1; +			continue; +		} +		if (call_parsechar) +			c=parsechar(); +		c2=c; +		call_parsechar=1; + +		if (curchar() == '-') +		{ +			nextchar(); +			c2=parsechar(); +		} +		while ( c <= c2 ) +		{ +			p->reset[ c / 8 ] |= 1 << (c % 8); +			++c; +		} +	} while ((c=curchar()) != ']'); +} + +int Re::parsechar() +{ +int	c; + +	c=curchar(); +	if (c == 0)	throw -1; +	nextchar(); +	if (c != '\\') return (c); +	c=curchar(); + +	if (c == 0) +		throw -1; +	else if (c >= '0' && c <= '7') +	{ +	unsigned char uc=0; + +		while ( c >= '0' && c <= '7' ) +		{ +			uc = uc * 8 + (c-'0'); +			nextchar(); +			c=curchar(); +		} +		c=uc; +	} +	else +	{ +		c=backslash_char(c); +		nextchar(); +	} +	return (c); +} + +///////////////////////////////////////////////////////////////////////////// + +int Re::Match(ReMatch &string) +{ +	matched= -1; +	matchedpos=0; + +	charsmatched=0; +	state1.init(nextid); +	state2.init(nextid); + +	curstate= &state1; +	nextstate= &state2; + +	curstate->nodes[0]=first; +	curstate->numnodes=1; +	curstate->nodenums[first->id]=0; + +	final_id=final->id; + +	if (VerboseLevel() > 8) +	{ +		merr.write("*** MATCH START ***\n"); +	} + +	for (;;) +	{ +	// Compute null closure + +	unsigned n; + +		for (n=0; n<curstate->numnodes; n++) +		{ +		RegExpNode *p=curstate->nodes[n]; + +			if (p->thechar != RENULL)	continue; + +		RegExpNode *q=p->next1; + +			if (q && curstate->nodenums[q->id] != charsmatched) +			{ +				curstate->nodes[curstate->numnodes++]=q; +				curstate->nodenums[q->id]=charsmatched; +				if (VerboseLevel() > 8) +				{ +				Buffer b; + +					b="  Transition to state "; +					b.append((unsigned long)q->id); +					b += '\n'; +					b += '\0'; +					merr.write(b); +				} +			} + +			q=p->next2; +			if (q && curstate->nodenums[q->id] != charsmatched) +			{ +				curstate->nodes[curstate->numnodes++]=q; +				curstate->nodenums[q->id]=charsmatched; +				if (VerboseLevel() > 8) +				{ +				Buffer b; + +					b="  Transition to state "; +					b.append((unsigned long)q->id); +					b += '\n'; +					b += '\0'; +					merr.write(b); +				} +			} +		} + +	int	nextChar; + +		if (curstate->nodenums[final_id] == charsmatched) +		{ +		off_t	pos=string.GetCurrentPos(); + +			if (VerboseLevel() > 8) +				merr.write("**Final node.\n"); +			if (chainedre) +			{ +			unsigned long saved_matched_chainedre= +					chainedre->charsmatched; + +				// On subsequent passes, charsmatched gets +				// reset.  If, previously, we had a match, +				// don't forget # of characters matched! + +				if (VerboseLevel() > 8) +					merr.write( +					"**Final node - checking subexpr.\n"); +				if (chainedre->Match(string) == 0) +				{ +					if (VerboseLevel() > 8) +					{ +					Buffer	buf; + +						buf="**Subexpr matched after "; +						buf.append( (unsigned long) +							charsmatched); +						buf += " characters.\n"; +						buf += '\0'; +						merr.write(buf); +					} +					matched=0; +					matchedpos=charsmatched; +					if (isDummy)	// Don't need to +							// look for max matches +							// for the dummy block +					{ +						return (0); +					} +				} +				else +				{ +					if (VerboseLevel() > 8) +						merr.write( +						"**Subexpr didn't match.\n"); +					chainedre->charsmatched= +						saved_matched_chainedre; +				} +				string.SetCurrentPos(pos); +				nextChar=string.NextChar(); +			} +			else +			{ +				if (!matchFull)	// We don't need to match full +						// string. +				{ +					if (VerboseLevel() > 8) +					{ +					Buffer	buf; + +						buf="Matched "; +						buf.append( (unsigned long) +							charsmatched); +						buf += " characters.\n"; +						buf += '\0'; +						merr.write(buf); +					} +					matched=0; +					matchedpos=charsmatched; +				} + +				nextChar=string.NextChar(); +				if ( nextChar < 0) +				{ +					if (VerboseLevel() > 8) +					{ +					Buffer	buf; + +						buf="Matched "; +						buf.append( (unsigned long) +							charsmatched); +						buf += " characters.\n"; +						buf += '\0'; +						merr.write(buf); +					} +					return (0);	// Matched everything +				} +			} +		} +		else nextChar=string.NextChar(); + +		if (nextChar < 0) +		{ +			if (VerboseLevel() > 8) +				merr.write( +					"Failed - End of matching string.\n"); +			charsmatched=matchedpos; +			return (matched); +		} +		if (curstate->numnodes == 0)	// No sense to continue +		{ +			if (VerboseLevel() > 8) +				merr.write( +					"Failed - out of states.\n"); +			charsmatched=matchedpos; +			return (matched); +		} + +		if (VerboseLevel() > 8) +		{ +		Buffer	b; + +			b="Matching character: "; + +			if (nextChar <= ' ' || nextChar > 127) +			{ +				b += '#'; +				b.append((unsigned long)nextChar); +			} +			else	b += (char)nextChar; +			b += '\n'; +			b += '\0'; +			merr.write(b); +		} +		++charsmatched; + +		if (!casesensitive) +			nextChar=tolower(nextChar); + +		nextstate->numnodes=0; + +		for (n=0; n<curstate->numnodes; n++) +		{ +		RegExpNode *p=curstate->nodes[n]; + +			if (p->thechar == RESET) +			{ +				if ((p->reset[nextChar / 8] & +						(1 << (nextChar % 8))) == 0) +				{ +					if (casesensitive)	continue; + +				int	uchar=toupper(nextChar); +					if ((p->reset[uchar / 8] & +						(1 << (uchar % 8))) == 0) +							continue; +				} +			} +			else +			{ +				if (p->thechar != nextChar) +				{ +					if (casesensitive)	continue; +				int	uchar=toupper(nextChar); +					if (p->thechar != uchar) +						continue; +				} +			} + +		RegExpNode *q=p->next1; + +			if (q && nextstate->nodenums[q->id] != charsmatched) +			{ +				nextstate->nodes[nextstate->numnodes++]=q; +				nextstate->nodenums[q->id]=charsmatched; +				if (VerboseLevel() > 8) +				{ +				Buffer b; + +					b="  Transition to state "; +					b.append((unsigned long)q->id); +					b += '\n'; +					b += '\0'; +					merr.write(b); +				} +			} + +			q=p->next2; + +			if (q && nextstate->nodenums[q->id] != charsmatched) +			{ +				nextstate->nodes[nextstate->numnodes++]=q; +				nextstate->nodenums[q->id]=charsmatched; +				if (VerboseLevel() > 8) +				{ +				Buffer b; + +					b="  Transition to state "; +					b.append((unsigned long)q->id); +					b += '\n'; +					b += '\0'; +					merr.write(b); +				} +			} +		} + +	ReEval *swap=curstate; curstate=nextstate; nextstate=swap; +	} +} | 
