exp/regexp/syntax: syntax data structures, parser

Parser is a work in progress but can populate most of the interesting parts of the data structure, so a good checkpoint. All the complicated Perl syntax is missing, as are various important optimizations made during parsing to the syntax tree. The plan is that exp/regexp's API will mimic regexp, and exp/regexp/syntax provides the parser directly for programs that need it (and for implementing exp/regexp). Once finished, exp/regexp will replace regexp. R=r, sam.thorogood, kevlar, edsrzf CC=golang-dev https://golang.org/cl/4538123
2025-05-10 01:53:02 +00:00 · 2011-06-13 09:20:23 -04:00 · 2011-06-13 09:20:23 -04:00 · b96c3477f8
commit b96c3477f8
parent b4bb970e18
5 changed files with 1050 additions and 0 deletions
--- a/src/pkg/Makefile
+++ b/src/pkg/Makefile
@ -81,6 +81,7 @@ DIRS=\
 	exp/eval\
 	exp/gui\
 	exp/gui/x11\
 	exp/regexp/syntax\
 	expvar\
 	flag\
 	fmt\
--- a/src/pkg/exp/regexp/syntax/Makefile
+++ b/src/pkg/exp/regexp/syntax/Makefile
@ -0,0 +1,12 @@
 # Copyright 2011 The Go Authors. All rights reserved.
 # Use of this source code is governed by a BSD-style
 # license that can be found in the LICENSE file.
 include ../../../../Make.inc
 TARG=exp/regexp/syntax
 GOFILES=\
 	parse.go\
 	regexp.go\
 include ../../../../Make.pkg
--- a/src/pkg/exp/regexp/syntax/parse.go
+++ b/src/pkg/exp/regexp/syntax/parse.go
@ -0,0 +1,561 @@
 // Copyright 2011 The Go Authors.  All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 package syntax
 import (
 	"os"
 	"sort"
 	"unicode"
 	"utf8"
 )
 // An Error describes a failure to parse a regular expression
 // and gives the offending expression.
 type Error struct {
 	Code ErrorCode
 	Expr string
 }
 func (e *Error) String() string {
 	return "error parsing regexp: " + e.Code.String() + ": `" + e.Expr + "`"
 }
 // An ErrorCode describes a failure to parse a regular expression.
 type ErrorCode string
 const (
 	// Unexpected error
 	ErrInternalError ErrorCode = "regexp/syntax: internal error"
 	// Parse errors
 	ErrInvalidCharClass      ErrorCode = "invalid character class"
 	ErrInvalidCharRange      ErrorCode = "invalid character class range"
 	ErrInvalidEscape         ErrorCode = "invalid escape sequence"
 	ErrInvalidNamedCapture   ErrorCode = "invalid named capture"
 	ErrInvalidPerlOp         ErrorCode = "invalid or unsupported Perl syntax"
 	ErrInvalidRepeatOp       ErrorCode = "invalid nested repetition operator"
 	ErrInvalidRepeatSize     ErrorCode = "invalid repeat count"
 	ErrInvalidUTF8           ErrorCode = "invalid UTF-8"
 	ErrMissingBracket        ErrorCode = "missing closing ]"
 	ErrMissingParen          ErrorCode = "missing closing )"
 	ErrMissingRepeatArgument ErrorCode = "missing argument to repetition operator"
 	ErrTrailingBackslash     ErrorCode = "trailing backslash at end of expression"
 )
 func (e ErrorCode) String() string {
 	return string(e)
 }
 // Flags control the behavior of the parser and record information about regexp context.
 type Flags uint16
 const (
 	FoldCase      Flags = 1 << iota // case-insensitive match
 	Literal                         // treat pattern as literal string
 	ClassNL                         // allow character classes like [^a-z] and [[:space:]] to match newline
 	DotNL                           // allow . to match newline
 	OneLine                         // treat ^ and $ as only matching at beginning and end of text
 	NonGreedy                       // make repetition operators default to non-greedy
 	PerlX                           // allow Perl extensions
 	UnicodeGroups                   // allow \p{Han}, \P{Han} for Unicode group and negation
 	WasDollar                       // regexp OpEndText was $, not \z
 	Simple                          // regexp contains no counted repetition
 	MatchNL = ClassNL | DotNL
 	Perl        = ClassNL | OneLine | PerlX | UnicodeGroups // as close to Perl as possible
 	POSIX Flags = 0                                         // POSIX syntax
 )
 // Pseudo-ops for parsing stack.
 const (
 	opLeftParen = opPseudo + iota
 	opVerticalBar
 )
 type parser struct {
 	flags       Flags     // parse mode flags
 	stack       []*Regexp // stack of parsed expressions
 	numCap      int       // number of capturing groups seen
 	wholeRegexp string
 }
 // Parse stack manipulation.
 // push pushes the regexp re onto the parse stack and returns the regexp.
 func (p *parser) push(re *Regexp) *Regexp {
 	// TODO: automatic concatenation
 	// TODO: turn character class into literal
 	// TODO: compute simple
 	p.stack = append(p.stack, re)
 	return re
 }
 // newLiteral returns a new OpLiteral Regexp with the given flags
 func newLiteral(r int, flags Flags) *Regexp {
 	re := &Regexp{
 		Op:    OpLiteral,
 		Flags: flags,
 	}
 	re.Rune0[0] = r
 	re.Rune = re.Rune0[:1]
 	return re
 }
 // literal pushes a literal regexp for the rune r on the stack
 // and returns that regexp.
 func (p *parser) literal(r int) *Regexp {
 	return p.push(newLiteral(r, p.flags))
 }
 // op pushes a regexp with the given op onto the stack
 // and returns that regexp.
 func (p *parser) op(op Op) *Regexp {
 	return p.push(&Regexp{Op: op, Flags: p.flags})
 }
 // repeat replaces the top stack element with itself repeated
 // according to op.
 func (p *parser) repeat(op Op, opstr string) os.Error {
 	n := len(p.stack)
 	if n == 0 {
 		return &Error{ErrMissingRepeatArgument, opstr}
 	}
 	sub := p.stack[n-1]
 	re := &Regexp{
 		Op: op,
 	}
 	re.Sub = re.Sub0[:1]
 	re.Sub[0] = sub
 	p.stack[n-1] = re
 	return nil
 }
 // concat replaces the top of the stack (above the topmost '|' or '(') with its concatenation.
 func (p *parser) concat() *Regexp {
 	// TODO: Flatten concats.
 	// Scan down to find pseudo-operator | or (.
 	i := len(p.stack)
 	for i > 0 && p.stack[i-1].Op < opPseudo {
 		i--
 	}
 	sub := p.stack[i:]
 	p.stack = p.stack[:i]
 	var re *Regexp
 	switch len(sub) {
 	case 0:
 		re = &Regexp{Op: OpEmptyMatch}
 	case 1:
 		re = sub[0]
 	default:
 		re = &Regexp{Op: OpConcat}
 		re.Sub = append(re.Sub0[:0], sub...)
 	}
 	return p.push(re)
 }
 // alternate replaces the top of the stack (above the topmost '(') with its alternation.
 func (p *parser) alternate() *Regexp {
 	// TODO: Flatten alternates.
 	// Scan down to find pseudo-operator (.
 	// There are no | above (.
 	i := len(p.stack)
 	for i > 0 && p.stack[i-1].Op < opPseudo {
 		i--
 	}
 	sub := p.stack[i:]
 	p.stack = p.stack[:i]
 	var re *Regexp
 	switch len(sub) {
 	case 0:
 		re = &Regexp{Op: OpNoMatch}
 	case 1:
 		re = sub[0]
 	default:
 		re = &Regexp{Op: OpAlternate}
 		re.Sub = append(re.Sub0[:0], sub...)
 	}
 	return p.push(re)
 }
 // Parsing.
 func Parse(s string, flags Flags) (*Regexp, os.Error) {
 	if flags&Literal != 0 {
 		// Trivial parser for literal string.
 		if err := checkUTF8(s); err != nil {
 			return nil, err
 		}
 		re := &Regexp{
 			Op:    OpLiteral,
 			Flags: flags,
 		}
 		re.Rune = re.Rune0[:0] // use local storage for small strings
 		for _, c := range s {
 			if len(re.Rune) >= cap(re.Rune) {
 				// string is too long to fit in Rune0.  let Go handle it
 				re.Rune = []int(s)
 				break
 			}
 			re.Rune = append(re.Rune, c)
 		}
 		return re, nil
 	}
 	// Otherwise, must do real work.
 	var (
 		p   parser
 		err os.Error
 		c   int
 		op  Op
 	)
 	p.flags = flags
 	p.wholeRegexp = s
 	t := s
 	for t != "" {
 		switch t[0] {
 		default:
 			if c, t, err = nextRune(t); err != nil {
 				return nil, err
 			}
 			p.literal(c)
 		case '(':
 			// TODO: Actual Perl flag parsing.
 			if len(t) >= 3 && t[1] == '?' && t[2] == ':' {
 				// non-capturing paren
 				p.op(opLeftParen)
 				t = t[3:]
 				break
 			}
 			p.numCap++
 			p.op(opLeftParen).Cap = p.numCap
 			t = t[1:]
 		case '|':
 			p.concat()
 			if err = p.parseVerticalBar(); err != nil {
 				return nil, err
 			}
 			t = t[1:]
 		case ')':
 			if err = p.parseRightParen(); err != nil {
 				return nil, err
 			}
 			t = t[1:]
 		case '^':
 			if p.flags&OneLine != 0 {
 				p.op(OpBeginText)
 			} else {
 				p.op(OpBeginLine)
 			}
 			t = t[1:]
 		case '$':
 			if p.flags&OneLine != 0 {
 				p.op(OpEndText).Flags |= WasDollar
 			} else {
 				p.op(OpEndLine)
 			}
 			t = t[1:]
 		case '.':
 			if p.flags&DotNL != 0 {
 				p.op(OpAnyChar)
 			} else {
 				p.op(OpAnyCharNotNL)
 			}
 			t = t[1:]
 		case '[':
 			if t, err = p.parseClass(t); err != nil {
 				return nil, err
 			}
 		case '*', '+', '?':
 			switch t[0] {
 			case '*':
 				op = OpStar
 			case '+':
 				op = OpPlus
 			case '?':
 				op = OpQuest
 			}
 			// TODO: greedy
 			if err = p.repeat(op, t[0:1]); err != nil {
 				return nil, err
 			}
 			t = t[1:]
 		case '{':
 			return nil, os.NewError("repeat not implemented")
 		case '\\':
 			return nil, os.NewError("escape not implemented")
 		}
 	}
 	p.concat()
 	if p.swapVerticalBar() {
 		// pop vertical bar
 		p.stack = p.stack[:len(p.stack)-1]
 	}
 	p.alternate()
 	n := len(p.stack)
 	if n != 1 {
 		return nil, &Error{ErrMissingParen, s}
 	}
 	return p.stack[0], nil
 }
 // parseVerticalBar handles a | in the input.
 func (p *parser) parseVerticalBar() os.Error {
 	p.concat()
 	// The concatenation we just parsed is on top of the stack.
 	// If it sits above an opVerticalBar, swap it below
 	// (things below an opVerticalBar become an alternation).
 	// Otherwise, push a new vertical bar.
 	if !p.swapVerticalBar() {
 		p.op(opVerticalBar)
 	}
 	return nil
 }
 // If the top of the stack is an element followed by an opVerticalBar
 // swapVerticalBar swaps the two and returns true.
 // Otherwise it returns false.
 func (p *parser) swapVerticalBar() bool {
 	if n := len(p.stack); n >= 2 {
 		re1 := p.stack[n-1]
 		re2 := p.stack[n-2]
 		if re2.Op == opVerticalBar {
 			p.stack[n-2] = re1
 			p.stack[n-1] = re2
 			return true
 		}
 	}
 	return false
 }
 // parseRightParen handles a ) in the input.
 func (p *parser) parseRightParen() os.Error {
 	p.concat()
 	if p.swapVerticalBar() {
 		// pop vertical bar
 		p.stack = p.stack[:len(p.stack)-1]
 	}
 	p.alternate()
 	n := len(p.stack)
 	if n < 2 {
 		return &Error{ErrInternalError, ""}
 	}
 	re1 := p.stack[n-1]
 	re2 := p.stack[n-2]
 	p.stack = p.stack[:n-2]
 	if re2.Op != opLeftParen {
 		return &Error{ErrMissingParen, p.wholeRegexp}
 	}
 	if re2.Cap == 0 {
 		// Just for grouping.
 		p.push(re1)
 	} else {
 		re2.Op = OpCapture
 		re2.Sub = re2.Sub0[:1]
 		re2.Sub[0] = re1
 		p.push(re2)
 	}
 	return nil
 }
 // parseClassChar parses a character class character at the beginning of s
 // and returns it.
 func (p *parser) parseClassChar(s, wholeClass string) (r int, rest string, err os.Error) {
 	if s == "" {
 		return 0, "", &Error{Code: ErrMissingBracket, Expr: wholeClass}
 	}
 	// TODO: Escapes
 	return nextRune(s)
 }
 // parseClass parses a character class at the beginning of s
 // and pushes it onto the parse stack.
 func (p *parser) parseClass(s string) (rest string, err os.Error) {
 	t := s[1:] // chop [
 	re := &Regexp{Op: OpCharClass, Flags: p.flags}
 	re.Rune = re.Rune0[:0]
 	sign := +1
 	if t != "" && t[0] == '^' {
 		sign = -1
 		t = t[1:]
 		// If character class does not match \n, add it here,
 		// so that negation later will do the right thing.
 		if p.flags&ClassNL == 0 {
 			re.Rune = append(re.Rune, '\n', '\n')
 		}
 	}
 	class := re.Rune
 	first := true // ] and - are okay as first char in class
 	for t == "" || t[0] != ']' || first {
 		// POSIX: - is only okay unescaped as first or last in class.
 		// Perl: - is okay anywhere.
 		if t != "" && t[0] == '-' && p.flags&PerlX == 0 && !first && (len(t) == 1 || t[1] != ']') {
 			_, size := utf8.DecodeRuneInString(t[1:])
 			return "", &Error{Code: ErrInvalidCharRange, Expr: t[:1+size]}
 		}
 		first = false
 		// TODO: Look for [:alnum:]
 		// TODO: Look for Unicode group.
 		// TODO: Look for Perl group.
 		// Single character or simple range.
 		rng := t
 		var lo, hi int
 		if lo, t, err = p.parseClassChar(t, s); err != nil {
 			return "", err
 		}
 		hi = lo
 		// [a-] means (a|-) so check for final ].
 		if len(t) >= 2 && t[0] == '-' && t[1] != ']' {
 			t = t[1:]
 			if hi, t, err = p.parseClassChar(t, s); err != nil {
 				return "", err
 			}
 			if hi < lo {
 				rng = rng[:len(rng)-len(t)]
 				return "", &Error{Code: ErrInvalidCharRange, Expr: rng}
 			}
 		}
 		// Expand last range if overlaps or abuts.
 		if n := len(class); n > 0 {
 			clo, chi := class[n-2], class[n-1]
 			if lo <= chi+1 && clo <= hi+1 {
 				if lo < clo {
 					class[n-2] = lo
 				}
 				if hi > chi {
 					class[n-1] = hi
 				}
 				continue
 			}
 		}
 		class = append(class, lo, hi)
 	}
 	t = t[1:] // chop ]
 	// Use &re.Rune instead of &class to avoid allocation.
 	re.Rune = class
 	class = cleanClass(&re.Rune)
 	if sign < 0 {
 		class = negateClass(class)
 	}
 	re.Rune = class
 	p.push(re)
 	return t, nil
 }
 // cleanClass sorts the ranges (pairs of elements of r),
 // merges them, and eliminates duplicates.
 func cleanClass(rp *[]int) []int {
 	// Sort by lo increasing, hi decreasing to break ties.
 	sort.Sort(ranges{rp})
 	r := *rp
 	// Merge abutting, overlapping.
 	w := 2 // write index
 	for i := 2; i < len(r); i += 2 {
 		lo, hi := r[i], r[i+1]
 		if lo <= r[w-1]+1 {
 			// merge with previous range
 			if hi > r[w-1] {
 				r[w-1] = hi
 			}
 			continue
 		}
 		// new disjoint range
 		r[w] = lo
 		r[w+1] = hi
 		w += 2
 	}
 	return r[:w]
 }
 // negateClass overwrites r and returns r's negation.
 // It assumes the class r is already clean.
 func negateClass(r []int) []int {
 	nextLo := 0 // lo end of next class to add
 	w := 0      // write index
 	for i := 0; i < len(r); i += 2 {
 		lo, hi := r[i], r[i+1]
 		if nextLo <= lo-1 {
 			r[w] = nextLo
 			r[w+1] = lo - 1
 			w += 2
 		}
 		nextLo = hi + 1
 	}
 	if nextLo <= unicode.MaxRune {
 		// It's possible for the negation to have one more
 		// range - this one - than the original class, so use append.
 		r = append(r[:w], nextLo, unicode.MaxRune)
 	}
 	return r
 }
 // ranges implements sort.Interface on a []rune.
 // The choice of receiver type definition is strange
 // but avoids an allocation since we already have
 // a *[]int.
 type ranges struct {
 	p *[]int
 }
 func (ra ranges) Less(i, j int) bool {
 	p := *ra.p
 	i *= 2
 	j *= 2
 	return p[i] < p[j] || p[i] == p[j] && p[i+1] > p[j+1]
 }
 func (ra ranges) Len() int {
 	return len(*ra.p) / 2
 }
 func (ra ranges) Swap(i, j int) {
 	p := *ra.p
 	i *= 2
 	j *= 2
 	p[i], p[i+1], p[j], p[j+1] = p[j], p[j+1], p[i], p[i+1]
 }
 func checkUTF8(s string) os.Error {
 	for s != "" {
 		rune, size := utf8.DecodeRuneInString(s)
 		if rune == utf8.RuneError && size == 1 {
 			return &Error{Code: ErrInvalidUTF8, Expr: s}
 		}
 		s = s[size:]
 	}
 	return nil
 }
 func nextRune(s string) (c int, t string, err os.Error) {
 	c, size := utf8.DecodeRuneInString(s)
 	if c == utf8.RuneError && size == 1 {
 		return 0, "", &Error{Code: ErrInvalidUTF8, Expr: s}
 	}
 	return c, s[size:], nil
 }
--- a/src/pkg/exp/regexp/syntax/parse_test.go
+++ b/src/pkg/exp/regexp/syntax/parse_test.go
@ -0,0 +1,266 @@
 // Copyright 2011 The Go Authors.  All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 package syntax
 import (
 	"bytes"
 	"fmt"
 	"testing"
 	"unicode"
 )
 var parseTests = []struct {
 	Regexp string
 	Dump   string
 }{
 	// Base cases
 	{"a", "lit{a}"},
 	{"a.", "cat{lit{a}dot{}}"},
 	{"a.b", "cat{lit{a}dot{}lit{b}}"},
 	//	{ "ab", "str{ab}" },
 	{"ab", "cat{lit{a}lit{b}}"},
 	{"a.b.c", "cat{lit{a}dot{}lit{b}dot{}lit{c}}"},
 	//	{ "abc", "str{abc}" },
 	{"abc", "cat{lit{a}lit{b}lit{c}}"},
 	{"a|^", "alt{lit{a}bol{}}"},
 	//	{ "a|b", "cc{0x61-0x62}" },
 	{"a|b", "alt{lit{a}lit{b}}"},
 	{"(a)", "cap{lit{a}}"},
 	{"(a)|b", "alt{cap{lit{a}}lit{b}}"},
 	{"a*", "star{lit{a}}"},
 	{"a+", "plus{lit{a}}"},
 	{"a?", "que{lit{a}}"},
 	//	{ "a{2}", "rep{2,2 lit{a}}" },
 	//	{ "a{2,3}", "rep{2,3 lit{a}}" },
 	//	{ "a{2,}", "rep{2,-1 lit{a}}" },
 	//	{ "a*?", "nstar{lit{a}}" },
 	//	{ "a+?", "nplus{lit{a}}" },
 	//	{ "a??", "nque{lit{a}}" },
 	//	{ "a{2}?", "nrep{2,2 lit{a}}" },
 	//	{ "a{2,3}?", "nrep{2,3 lit{a}}" },
 	//	{ "a{2,}?", "nrep{2,-1 lit{a}}" },
 	{"", "emp{}"},
 	//	{ "|", "emp{}" },  // alt{emp{}emp{}} but got factored
 	//	{ "|", "alt{emp{}emp{}}" },
 	{"|x|", "alt{emp{}lit{x}emp{}}"},
 	{".", "dot{}"},
 	{"^", "bol{}"},
 	{"$", "eol{}"},
 	//	{ "\\|", "lit{|}" },
 	//	{ "\\(", "lit{(}" },
 	//	{ "\\)", "lit{)}" },
 	//	{ "\\*", "lit{*}" },
 	//	{ "\\+", "lit{+}" },
 	//	{ "\\?", "lit{?}" },
 	//	{ "{", "lit{{}" },
 	{"}", "lit{}}"},
 	//	{ "\\.", "lit{.}" },
 	//	{ "\\^", "lit{^}" },
 	//	{ "\\$", "lit{$}" },
 	//	{ "\\\\", "lit{\\}" },
 	{"[ace]", "cc{0x61 0x63 0x65}"},
 	{"[abc]", "cc{0x61-0x63}"},
 	{"[a-z]", "cc{0x61-0x7a}"},
 	//	{ "[a]", "lit{a}" },
 	{"[a]", "cc{0x61}"},
 	//	{ "\\-", "lit{-}" },
 	{"-", "lit{-}"},
 	//	{ "\\_", "lit{_}" },
 	// Posix and Perl extensions
 	//	{ "[[:lower:]]", "cc{0x61-0x7a}" },
 	//	{ "[a-z]", "cc{0x61-0x7a}" },
 	//	{ "[^[:lower:]]", "cc{0x0-0x60 0x7b-0x10ffff}" },
 	//	{ "[[:^lower:]]", "cc{0x0-0x60 0x7b-0x10ffff}" },
 	//	{ "(?i)[[:lower:]]", "cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}" },
 	//	{ "(?i)[a-z]", "cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}" },
 	//	{ "(?i)[^[:lower:]]", "cc{0x0-0x40 0x5b-0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}" },
 	//	{ "(?i)[[:^lower:]]", "cc{0x0-0x40 0x5b-0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}" },
 	//	{ "\\d", "cc{0x30-0x39}" },
 	//	{ "\\D", "cc{0x0-0x2f 0x3a-0x10ffff}" },
 	//	{ "\\s", "cc{0x9-0xa 0xc-0xd 0x20}" },
 	//	{ "\\S", "cc{0x0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}" },
 	//	{ "\\w", "cc{0x30-0x39 0x41-0x5a 0x5f 0x61-0x7a}" },
 	//	{ "\\W", "cc{0x0-0x2f 0x3a-0x40 0x5b-0x5e 0x60 0x7b-0x10ffff}" },
 	//	{ "(?i)\\w", "cc{0x30-0x39 0x41-0x5a 0x5f 0x61-0x7a 0x17f 0x212a}" },
 	//	{ "(?i)\\W", "cc{0x0-0x2f 0x3a-0x40 0x5b-0x5e 0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}" },
 	//	{ "[^\\\\]", "cc{0x0-0x5b 0x5d-0x10ffff}" },
 	//	{ "\\C", "byte{}" },
 	// Unicode, negatives, and a double negative.
 	//	{ "\\p{Braille}", "cc{0x2800-0x28ff}" },
 	//	{ "\\P{Braille}", "cc{0x0-0x27ff 0x2900-0x10ffff}" },
 	//	{ "\\p{^Braille}", "cc{0x0-0x27ff 0x2900-0x10ffff}" },
 	//	{ "\\P{^Braille}", "cc{0x2800-0x28ff}" },
 	// More interesting regular expressions.
 	//	{ "a{,2}", "str{a{,2}}" },
 	//	{ "\\.\\^\\$\\\\", "str{.^$\\}" },
 	{"[a-zABC]", "cc{0x41-0x43 0x61-0x7a}"},
 	{"[^a]", "cc{0x0-0x60 0x62-0x10ffff}"},
 	{"[\xce\xb1-\xce\xb5\xe2\x98\xba]", "cc{0x3b1-0x3b5 0x263a}"}, // utf-8
 	//	{ "a*{", "cat{star{lit{a}}lit{{}}" },
 	// Test precedences
 	//	{ "(?:ab)*", "star{str{ab}}" },
 	//	{ "(ab)*", "star{cap{str{ab}}}" },
 	//	{ "ab|cd", "alt{str{ab}str{cd}}" },
 	//	{ "a(b|c)d", "cat{lit{a}cap{cc{0x62-0x63}}lit{d}}" },
 	{"(?:ab)*", "star{cat{lit{a}lit{b}}}"},
 	{"(ab)*", "star{cap{cat{lit{a}lit{b}}}}"},
 	{"ab|cd", "alt{cat{lit{a}lit{b}}cat{lit{c}lit{d}}}"},
 	{"a(b|c)d", "cat{lit{a}cap{alt{lit{b}lit{c}}}lit{d}}"},
 	// Test flattening.
 	//	{ "(?:a)", "lit{a}" },
 	//	{ "(?:ab)(?:cd)", "str{abcd}" },
 	//	{ "(?:a|b)|(?:c|d)", "cc{0x61-0x64}" },
 	//	{ "a|.", "dot{}" },
 	//	{ ".|a", "dot{}" },
 	// Test Perl quoted literals
 	//	{ "\\Q+|*?{[\\E", "str{+|*?{[}" },
 	//	{ "\\Q+\\E+", "plus{lit{+}}" },
 	//	{ "\\Q\\\\E", "lit{\\}" },
 	//	{ "\\Q\\\\\\E", "str{\\\\}" },
 	// Test Perl \A and \z
 	//	{ "(?m)^", "bol{}" },
 	//	{ "(?m)$", "eol{}" },
 	//	{ "(?-m)^", "bot{}" },
 	//	{ "(?-m)$", "eot{}" },
 	//	{ "(?m)\\A", "bot{}" },
 	//	{ "(?m)\\z", "eot{\\z}" },
 	//	{ "(?-m)\\A", "bot{}" },
 	//	{ "(?-m)\\z", "eot{\\z}" },
 	// Test named captures
 	//	{ "(?P<name>a)", "cap{name:lit{a}}" },
 	// Case-folded literals
 	//	{ "[Aa]", "litfold{a}" },
 	// Strings
 	//	{ "abcde", "str{abcde}" },
 	//	{ "[Aa][Bb]cd", "cat{strfold{ab}str{cd}}" },
 }
 const testFlags = MatchNL | PerlX | UnicodeGroups
 // Test Parse -> Dump.
 func TestParseDump(t *testing.T) {
 	for _, tt := range parseTests {
 		re, err := Parse(tt.Regexp, testFlags)
 		if err != nil {
 			t.Errorf("Parse(%#q): %v", tt.Regexp, err)
 			continue
 		}
 		d := dump(re)
 		if d != tt.Dump {
 			t.Errorf("Parse(%#q).Dump() = %#q want %#q", tt.Regexp, d, tt.Dump)
 		}
 	}
 }
 // dump prints a string representation of the regexp showing
 // the structure explicitly.
 func dump(re *Regexp) string {
 	var b bytes.Buffer
 	dumpRegexp(&b, re)
 	return b.String()
 }
 var opNames = []string{
 	OpNoMatch:        "no",
 	OpEmptyMatch:     "emp",
 	OpLiteral:        "lit",
 	OpCharClass:      "cc",
 	OpAnyCharNotNL:   "dnl",
 	OpAnyChar:        "dot",
 	OpBeginLine:      "bol",
 	OpEndLine:        "eol",
 	OpBeginText:      "bot",
 	OpEndText:        "eot",
 	OpWordBoundary:   "wb",
 	OpNoWordBoundary: "nwb",
 	OpCapture:        "cap",
 	OpStar:           "star",
 	OpPlus:           "plus",
 	OpQuest:          "que",
 	OpRepeat:         "rep",
 	OpConcat:         "cat",
 	OpAlternate:      "alt",
 }
 // dumpRegexp writes an encoding of the syntax tree for the regexp re to b.
 // It is used during testing to distinguish between parses that might print
 // the same using re's String method.
 func dumpRegexp(b *bytes.Buffer, re *Regexp) {
 	if int(re.Op) >= len(opNames) || opNames[re.Op] == "" {
 		fmt.Fprintf(b, "op%d", re.Op)
 	} else {
 		switch re.Op {
 		default:
 			b.WriteString(opNames[re.Op])
 		case OpStar, OpPlus, OpQuest, OpRepeat:
 			if re.Flags&NonGreedy != 0 {
 				b.WriteByte('n')
 			}
 			b.WriteString(opNames[re.Op])
 		case OpLiteral:
 			if len(re.Rune) > 1 {
 				b.WriteString("str")
 			} else {
 				b.WriteString("lit")
 			}
 			if re.Flags&FoldCase != 0 {
 				for _, r := range re.Rune {
 					if unicode.ToUpper(r) != r {
 						b.WriteString("fold")
 					}
 				}
 			}
 		}
 	}
 	b.WriteByte('{')
 	switch re.Op {
 	case OpEndText:
 		if re.Flags&WasDollar == 0 {
 			b.WriteString(`\z`)
 		}
 	case OpLiteral:
 		for _, r := range re.Rune {
 			b.WriteRune(r)
 		}
 	case OpConcat, OpAlternate:
 		for _, sub := range re.Sub {
 			dumpRegexp(b, sub)
 		}
 	case OpStar, OpPlus, OpQuest:
 		dumpRegexp(b, re.Sub[0])
 	case OpRepeat:
 		fmt.Fprintf(b, "%d,%d ", re.Min, re.Max)
 		dumpRegexp(b, re.Sub[0])
 	case OpCapture:
 		if re.Name != "" {
 			b.WriteString(re.Name)
 			b.WriteByte(':')
 		}
 		dumpRegexp(b, re.Sub[0])
 	case OpCharClass:
 		sep := ""
 		for i := 0; i < len(re.Rune); i += 2 {
 			b.WriteString(sep)
 			sep = " "
 			lo, hi := re.Rune[i], re.Rune[i+1]
 			if lo == hi {
 				fmt.Fprintf(b, "%#x", lo)
 			} else {
 				fmt.Fprintf(b, "%#x-%#x", lo, hi)
 			}
 		}
 	}
 	b.WriteByte('}')
 }
--- a/src/pkg/exp/regexp/syntax/regexp.go
+++ b/src/pkg/exp/regexp/syntax/regexp.go
@ -0,0 +1,210 @@
 // Copyright 2011 The Go Authors.  All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 // Package syntax parses regular expressions into syntax trees.
 // WORK IN PROGRESS.
 package syntax
 // Note to implementers:
 // In this package, re is always a *Regexp and r is always a rune.
 import (
 	"bytes"
 	"strconv"
 	"strings"
 	"unicode"
 )
 // A Regexp is a node in a regular expression syntax tree.
 type Regexp struct {
 	Op       Op // operator
 	Flags    Flags
 	Sub      []*Regexp  // subexpressions, if any
 	Sub0     [1]*Regexp // storage for short Sub
 	Rune     []int      // matched runes, for OpLiteral, OpCharClass
 	Rune0    [2]int     // storage for short Rune
 	Min, Max int        // min, max for OpRepeat
 	Cap      int        // capturing index, for OpCapture
 	Name     string     // capturing name, for OpCapture
 }
 // An Op is a single regular expression operator.
 type Op uint8
 // Operators are listed in precedence order, tightest binding to weakest.
 const (
 	OpNoMatch        Op = 1 + iota // matches no strings
 	OpEmptyMatch                   // matches empty string
 	OpLiteral                      // matches Runes sequence
 	OpCharClass                    // matches Runes interpreted as range pair list
 	OpAnyCharNotNL                 // matches any character
 	OpAnyChar                      // matches any character
 	OpBeginLine                    // matches empty string at beginning of line
 	OpEndLine                      // matches empty string at end of line
 	OpBeginText                    // matches empty string at beginning of text
 	OpEndText                      // matches empty string at end of text
 	OpWordBoundary                 // matches word boundary `\b`
 	OpNoWordBoundary               // matches word non-boundary `\B`
 	OpCapture                      // capturing subexpression with index Cap, optional name Name
 	OpStar                         // matches Sub[0] zero or more times
 	OpPlus                         // matches Sub[0] one or more times
 	OpQuest                        // matches Sub[0] zero or one times
 	OpRepeat                       // matches Sub[0] at least Min times, at most Max (Max == -1 is no limit)
 	OpConcat                       // matches concatenation of Subs
 	OpAlternate                    // matches alternation of Subs
 )
 const opPseudo Op = 128 // where pseudo-ops start
 // writeRegexp writes the Perl syntax for the regular expression re to b.
 func writeRegexp(b *bytes.Buffer, re *Regexp) {
 	switch re.Op {
 	default:
 		b.WriteString("<invalid op" + strconv.Itoa(int(re.Op)) + ">")
 	case OpNoMatch:
 		b.WriteString(`[^\x00-\x{10FFFF}]`)
 	case OpEmptyMatch:
 		b.WriteString(`(?:)`)
 	case OpLiteral:
 		for _, r := range re.Rune {
 			escape(b, r, false)
 		}
 	case OpCharClass:
 		if len(re.Rune)%2 != 0 {
 			b.WriteString(`[invalid char class]`)
 			break
 		}
 		b.WriteRune('[')
 		if len(re.Rune) > 0 && re.Rune[0] == 0 && re.Rune[len(re.Rune)-1] == unicode.MaxRune {
 			// Contains 0 and MaxRune.  Probably a negated class.
 			// Print the gaps.
 			b.WriteRune('^')
 			for i := 1; i < len(re.Rune)-1; i += 2 {
 				lo, hi := re.Rune[i]+1, re.Rune[i+1]-1
 				escape(b, lo, lo == '-')
 				if lo != hi {
 					b.WriteRune('-')
 					escape(b, hi, hi == '-')
 				}
 			}
 		} else {
 			for i := 0; i < len(re.Rune); i += 2 {
 				lo, hi := re.Rune[i], re.Rune[i+1]
 				escape(b, lo, lo == '-')
 				if lo != hi {
 					b.WriteRune('-')
 					escape(b, hi, hi == '-')
 				}
 			}
 		}
 		b.WriteRune(']')
 	case OpAnyCharNotNL:
 		b.WriteString(`[^\n]`)
 	case OpAnyChar:
 		b.WriteRune('.')
 	case OpBeginLine:
 		b.WriteRune('^')
 	case OpEndLine:
 		b.WriteRune('$')
 	case OpBeginText:
 		b.WriteString(`\A`)
 	case OpEndText:
 		b.WriteString(`\z`)
 	case OpWordBoundary:
 		b.WriteString(`\b`)
 	case OpNoWordBoundary:
 		b.WriteString(`\B`)
 	case OpCapture:
 		if re.Name != "" {
 			b.WriteString(`(?P<`)
 			b.WriteString(re.Name)
 			b.WriteRune('>')
 		} else {
 			b.WriteRune('(')
 		}
 		writeRegexp(b, re.Sub[0])
 		b.WriteRune(')')
 	case OpStar, OpPlus, OpQuest, OpRepeat:
 		if sub := re.Sub[0]; sub.Op > OpCapture {
 			b.WriteString(`(?:`)
 			writeRegexp(b, sub)
 			b.WriteString(`)`)
 		} else {
 			writeRegexp(b, sub)
 		}
 		switch re.Op {
 		case OpStar:
 			b.WriteRune('*')
 		case OpPlus:
 			b.WriteRune('+')
 		case OpQuest:
 			b.WriteRune('?')
 		case OpRepeat:
 			b.WriteRune('{')
 			b.WriteString(strconv.Itoa(re.Min))
 			if re.Max != re.Min {
 				b.WriteRune(',')
 				if re.Max >= 0 {
 					b.WriteString(strconv.Itoa(re.Max))
 				}
 			}
 			b.WriteRune('}')
 		}
 	case OpConcat:
 		for _, sub := range re.Sub {
 			if sub.Op == OpAlternate {
 				b.WriteString(`(?:`)
 				writeRegexp(b, sub)
 				b.WriteString(`)`)
 			} else {
 				writeRegexp(b, sub)
 			}
 		}
 	case OpAlternate:
 		for i, sub := range re.Sub {
 			if i > 0 {
 				b.WriteRune('|')
 			}
 			writeRegexp(b, sub)
 		}
 	}
 }
 func (re *Regexp) String() string {
 	var b bytes.Buffer
 	writeRegexp(&b, re)
 	return b.String()
 }
 const meta = `\.+*?()|[]{}^$`
 func escape(b *bytes.Buffer, r int, force bool) {
 	if unicode.IsPrint(r) {
 		if strings.IndexRune(meta, r) >= 0 || force {
 			b.WriteRune('\\')
 		}
 		b.WriteRune(r)
 		return
 	}
 	switch r {
 	case '\a':
 		b.WriteString(`\a`)
 	case '\f':
 		b.WriteString(`\f`)
 	case '\n':
 		b.WriteString(`\n`)
 	case '\r':
 		b.WriteString(`\r`)
 	case '\t':
 		b.WriteString(`\t`)
 	case '\v':
 		b.WriteString(`\v`)
 	default:
 		b.WriteString(`\x{`)
 		b.WriteString(strconv.Itob(r, 16))
 		b.WriteString(`}`)
 	}
 }