1
2
3
4
5 package syntax
6
7 import (
8 "sort"
9 "strings"
10 "unicode"
11 "unicode/utf8"
12 )
13
14
15
16 type Error struct {
17 Code ErrorCode
18 Expr string
19 }
20
21 func (e *Error) Error() string {
22 return "error parsing regexp: " + e.Code.String() + ": `" + e.Expr + "`"
23 }
24
25
26 type ErrorCode string
27
28 const (
29
30 ErrInternalError ErrorCode = "regexp/syntax: internal error"
31
32
33 ErrInvalidCharClass ErrorCode = "invalid character class"
34 ErrInvalidCharRange ErrorCode = "invalid character class range"
35 ErrInvalidEscape ErrorCode = "invalid escape sequence"
36 ErrInvalidNamedCapture ErrorCode = "invalid named capture"
37 ErrInvalidPerlOp ErrorCode = "invalid or unsupported Perl syntax"
38 ErrInvalidRepeatOp ErrorCode = "invalid nested repetition operator"
39 ErrInvalidRepeatSize ErrorCode = "invalid repeat count"
40 ErrInvalidUTF8 ErrorCode = "invalid UTF-8"
41 ErrMissingBracket ErrorCode = "missing closing ]"
42 ErrMissingParen ErrorCode = "missing closing )"
43 ErrMissingRepeatArgument ErrorCode = "missing argument to repetition operator"
44 ErrTrailingBackslash ErrorCode = "trailing backslash at end of expression"
45 ErrUnexpectedParen ErrorCode = "unexpected )"
46 )
47
48 func (e ErrorCode) String() string {
49 return string(e)
50 }
51
52
53 type Flags uint16
54
55 const (
56 FoldCase Flags = 1 << iota
57 Literal
58 ClassNL
59 DotNL
60 OneLine
61 NonGreedy
62 PerlX
63 UnicodeGroups
64 WasDollar
65 Simple
66
67 MatchNL = ClassNL | DotNL
68
69 Perl = ClassNL | OneLine | PerlX | UnicodeGroups
70 POSIX Flags = 0
71 )
72
73
74 const (
75 opLeftParen = opPseudo + iota
76 opVerticalBar
77 )
78
79 type parser struct {
80 flags Flags
81 stack []*Regexp
82 free *Regexp
83 numCap int
84 wholeRegexp string
85 tmpClass []rune
86 }
87
88 func (p *parser) newRegexp(op Op) *Regexp {
89 re := p.free
90 if re != nil {
91 p.free = re.Sub0[0]
92 *re = Regexp{}
93 } else {
94 re = new(Regexp)
95 }
96 re.Op = op
97 return re
98 }
99
100 func (p *parser) reuse(re *Regexp) {
101 re.Sub0[0] = p.free
102 p.free = re
103 }
104
105
106
107
108 func (p *parser) push(re *Regexp) *Regexp {
109 if re.Op == OpCharClass && len(re.Rune) == 2 && re.Rune[0] == re.Rune[1] {
110
111 if p.maybeConcat(re.Rune[0], p.flags&^FoldCase) {
112 return nil
113 }
114 re.Op = OpLiteral
115 re.Rune = re.Rune[:1]
116 re.Flags = p.flags &^ FoldCase
117 } else if re.Op == OpCharClass && len(re.Rune) == 4 &&
118 re.Rune[0] == re.Rune[1] && re.Rune[2] == re.Rune[3] &&
119 unicode.SimpleFold(re.Rune[0]) == re.Rune[2] &&
120 unicode.SimpleFold(re.Rune[2]) == re.Rune[0] ||
121 re.Op == OpCharClass && len(re.Rune) == 2 &&
122 re.Rune[0]+1 == re.Rune[1] &&
123 unicode.SimpleFold(re.Rune[0]) == re.Rune[1] &&
124 unicode.SimpleFold(re.Rune[1]) == re.Rune[0] {
125
126 if p.maybeConcat(re.Rune[0], p.flags|FoldCase) {
127 return nil
128 }
129
130
131 re.Op = OpLiteral
132 re.Rune = re.Rune[:1]
133 re.Flags = p.flags | FoldCase
134 } else {
135
136 p.maybeConcat(-1, 0)
137 }
138
139 p.stack = append(p.stack, re)
140 return re
141 }
142
143
144
145
146
147
148
149
150
151
152 func (p *parser) maybeConcat(r rune, flags Flags) bool {
153 n := len(p.stack)
154 if n < 2 {
155 return false
156 }
157
158 re1 := p.stack[n-1]
159 re2 := p.stack[n-2]
160 if re1.Op != OpLiteral || re2.Op != OpLiteral || re1.Flags&FoldCase != re2.Flags&FoldCase {
161 return false
162 }
163
164
165 re2.Rune = append(re2.Rune, re1.Rune...)
166
167
168 if r >= 0 {
169 re1.Rune = re1.Rune0[:1]
170 re1.Rune[0] = r
171 re1.Flags = flags
172 return true
173 }
174
175 p.stack = p.stack[:n-1]
176 p.reuse(re1)
177 return false
178 }
179
180
181 func (p *parser) literal(r rune) {
182 re := p.newRegexp(OpLiteral)
183 re.Flags = p.flags
184 if p.flags&FoldCase != 0 {
185 r = minFoldRune(r)
186 }
187 re.Rune0[0] = r
188 re.Rune = re.Rune0[:1]
189 p.push(re)
190 }
191
192
193 func minFoldRune(r rune) rune {
194 if r < minFold || r > maxFold {
195 return r
196 }
197 min := r
198 r0 := r
199 for r = unicode.SimpleFold(r); r != r0; r = unicode.SimpleFold(r) {
200 if min > r {
201 min = r
202 }
203 }
204 return min
205 }
206
207
208
209 func (p *parser) op(op Op) *Regexp {
210 re := p.newRegexp(op)
211 re.Flags = p.flags
212 return p.push(re)
213 }
214
215
216
217
218
219 func (p *parser) repeat(op Op, min, max int, before, after, lastRepeat string) (string, error) {
220 flags := p.flags
221 if p.flags&PerlX != 0 {
222 if len(after) > 0 && after[0] == '?' {
223 after = after[1:]
224 flags ^= NonGreedy
225 }
226 if lastRepeat != "" {
227
228
229
230 return "", &Error{ErrInvalidRepeatOp, lastRepeat[:len(lastRepeat)-len(after)]}
231 }
232 }
233 n := len(p.stack)
234 if n == 0 {
235 return "", &Error{ErrMissingRepeatArgument, before[:len(before)-len(after)]}
236 }
237 sub := p.stack[n-1]
238 if sub.Op >= opPseudo {
239 return "", &Error{ErrMissingRepeatArgument, before[:len(before)-len(after)]}
240 }
241
242 re := p.newRegexp(op)
243 re.Min = min
244 re.Max = max
245 re.Flags = flags
246 re.Sub = re.Sub0[:1]
247 re.Sub[0] = sub
248 p.stack[n-1] = re
249
250 if op == OpRepeat && (min >= 2 || max >= 2) && !repeatIsValid(re, 1000) {
251 return "", &Error{ErrInvalidRepeatSize, before[:len(before)-len(after)]}
252 }
253
254 return after, nil
255 }
256
257
258
259
260
261
262
263
264
265
266 func repeatIsValid(re *Regexp, n int) bool {
267 if re.Op == OpRepeat {
268 m := re.Max
269 if m == 0 {
270 return true
271 }
272 if m < 0 {
273 m = re.Min
274 }
275 if m > n {
276 return false
277 }
278 if m > 0 {
279 n /= m
280 }
281 }
282 for _, sub := range re.Sub {
283 if !repeatIsValid(sub, n) {
284 return false
285 }
286 }
287 return true
288 }
289
290
291 func (p *parser) concat() *Regexp {
292 p.maybeConcat(-1, 0)
293
294
295 i := len(p.stack)
296 for i > 0 && p.stack[i-1].Op < opPseudo {
297 i--
298 }
299 subs := p.stack[i:]
300 p.stack = p.stack[:i]
301
302
303 if len(subs) == 0 {
304 return p.push(p.newRegexp(OpEmptyMatch))
305 }
306
307 return p.push(p.collapse(subs, OpConcat))
308 }
309
310
311 func (p *parser) alternate() *Regexp {
312
313
314 i := len(p.stack)
315 for i > 0 && p.stack[i-1].Op < opPseudo {
316 i--
317 }
318 subs := p.stack[i:]
319 p.stack = p.stack[:i]
320
321
322
323 if len(subs) > 0 {
324 cleanAlt(subs[len(subs)-1])
325 }
326
327
328
329 if len(subs) == 0 {
330 return p.push(p.newRegexp(OpNoMatch))
331 }
332
333 return p.push(p.collapse(subs, OpAlternate))
334 }
335
336
337 func cleanAlt(re *Regexp) {
338 switch re.Op {
339 case OpCharClass:
340 re.Rune = cleanClass(&re.Rune)
341 if len(re.Rune) == 2 && re.Rune[0] == 0 && re.Rune[1] == unicode.MaxRune {
342 re.Rune = nil
343 re.Op = OpAnyChar
344 return
345 }
346 if len(re.Rune) == 4 && re.Rune[0] == 0 && re.Rune[1] == '\n'-1 && re.Rune[2] == '\n'+1 && re.Rune[3] == unicode.MaxRune {
347 re.Rune = nil
348 re.Op = OpAnyCharNotNL
349 return
350 }
351 if cap(re.Rune)-len(re.Rune) > 100 {
352
353
354 re.Rune = append(re.Rune0[:0], re.Rune...)
355 }
356 }
357 }
358
359
360
361
362
363 func (p *parser) collapse(subs []*Regexp, op Op) *Regexp {
364 if len(subs) == 1 {
365 return subs[0]
366 }
367 re := p.newRegexp(op)
368 re.Sub = re.Sub0[:0]
369 for _, sub := range subs {
370 if sub.Op == op {
371 re.Sub = append(re.Sub, sub.Sub...)
372 p.reuse(sub)
373 } else {
374 re.Sub = append(re.Sub, sub)
375 }
376 }
377 if op == OpAlternate {
378 re.Sub = p.factor(re.Sub)
379 if len(re.Sub) == 1 {
380 old := re
381 re = re.Sub[0]
382 p.reuse(old)
383 }
384 }
385 return re
386 }
387
388
389
390
391
392
393
394
395
396
397
398
399 func (p *parser) factor(sub []*Regexp) []*Regexp {
400 if len(sub) < 2 {
401 return sub
402 }
403
404
405 var str []rune
406 var strflags Flags
407 start := 0
408 out := sub[:0]
409 for i := 0; i <= len(sub); i++ {
410
411
412
413
414
415
416 var istr []rune
417 var iflags Flags
418 if i < len(sub) {
419 istr, iflags = p.leadingString(sub[i])
420 if iflags == strflags {
421 same := 0
422 for same < len(str) && same < len(istr) && str[same] == istr[same] {
423 same++
424 }
425 if same > 0 {
426
427
428 str = str[:same]
429 continue
430 }
431 }
432 }
433
434
435
436
437
438
439 if i == start {
440
441 } else if i == start+1 {
442
443 out = append(out, sub[start])
444 } else {
445
446 prefix := p.newRegexp(OpLiteral)
447 prefix.Flags = strflags
448 prefix.Rune = append(prefix.Rune[:0], str...)
449
450 for j := start; j < i; j++ {
451 sub[j] = p.removeLeadingString(sub[j], len(str))
452 }
453 suffix := p.collapse(sub[start:i], OpAlternate)
454
455 re := p.newRegexp(OpConcat)
456 re.Sub = append(re.Sub[:0], prefix, suffix)
457 out = append(out, re)
458 }
459
460
461 start = i
462 str = istr
463 strflags = iflags
464 }
465 sub = out
466
467
468
469
470
471
472
473
474
475 start = 0
476 out = sub[:0]
477 var first *Regexp
478 for i := 0; i <= len(sub); i++ {
479
480
481
482
483
484 var ifirst *Regexp
485 if i < len(sub) {
486 ifirst = p.leadingRegexp(sub[i])
487 if first != nil && first.Equal(ifirst) &&
488
489 (isCharClass(first) || (first.Op == OpRepeat && first.Min == first.Max && isCharClass(first.Sub[0]))) {
490 continue
491 }
492 }
493
494
495
496
497
498 if i == start {
499
500 } else if i == start+1 {
501
502 out = append(out, sub[start])
503 } else {
504
505 prefix := first
506 for j := start; j < i; j++ {
507 reuse := j != start
508 sub[j] = p.removeLeadingRegexp(sub[j], reuse)
509 }
510 suffix := p.collapse(sub[start:i], OpAlternate)
511
512 re := p.newRegexp(OpConcat)
513 re.Sub = append(re.Sub[:0], prefix, suffix)
514 out = append(out, re)
515 }
516
517
518 start = i
519 first = ifirst
520 }
521 sub = out
522
523
524 start = 0
525 out = sub[:0]
526 for i := 0; i <= len(sub); i++ {
527
528
529
530
531
532
533 if i < len(sub) && isCharClass(sub[i]) {
534 continue
535 }
536
537
538
539 if i == start {
540
541 } else if i == start+1 {
542 out = append(out, sub[start])
543 } else {
544
545
546 max := start
547 for j := start + 1; j < i; j++ {
548 if sub[max].Op < sub[j].Op || sub[max].Op == sub[j].Op && len(sub[max].Rune) < len(sub[j].Rune) {
549 max = j
550 }
551 }
552 sub[start], sub[max] = sub[max], sub[start]
553
554 for j := start + 1; j < i; j++ {
555 mergeCharClass(sub[start], sub[j])
556 p.reuse(sub[j])
557 }
558 cleanAlt(sub[start])
559 out = append(out, sub[start])
560 }
561
562
563 if i < len(sub) {
564 out = append(out, sub[i])
565 }
566 start = i + 1
567 }
568 sub = out
569
570
571 start = 0
572 out = sub[:0]
573 for i := range sub {
574 if i+1 < len(sub) && sub[i].Op == OpEmptyMatch && sub[i+1].Op == OpEmptyMatch {
575 continue
576 }
577 out = append(out, sub[i])
578 }
579 sub = out
580
581 return sub
582 }
583
584
585
586 func (p *parser) leadingString(re *Regexp) ([]rune, Flags) {
587 if re.Op == OpConcat && len(re.Sub) > 0 {
588 re = re.Sub[0]
589 }
590 if re.Op != OpLiteral {
591 return nil, 0
592 }
593 return re.Rune, re.Flags & FoldCase
594 }
595
596
597
598 func (p *parser) removeLeadingString(re *Regexp, n int) *Regexp {
599 if re.Op == OpConcat && len(re.Sub) > 0 {
600
601
602 sub := re.Sub[0]
603 sub = p.removeLeadingString(sub, n)
604 re.Sub[0] = sub
605 if sub.Op == OpEmptyMatch {
606 p.reuse(sub)
607 switch len(re.Sub) {
608 case 0, 1:
609
610 re.Op = OpEmptyMatch
611 re.Sub = nil
612 case 2:
613 old := re
614 re = re.Sub[1]
615 p.reuse(old)
616 default:
617 copy(re.Sub, re.Sub[1:])
618 re.Sub = re.Sub[:len(re.Sub)-1]
619 }
620 }
621 return re
622 }
623
624 if re.Op == OpLiteral {
625 re.Rune = re.Rune[:copy(re.Rune, re.Rune[n:])]
626 if len(re.Rune) == 0 {
627 re.Op = OpEmptyMatch
628 }
629 }
630 return re
631 }
632
633
634
635 func (p *parser) leadingRegexp(re *Regexp) *Regexp {
636 if re.Op == OpEmptyMatch {
637 return nil
638 }
639 if re.Op == OpConcat && len(re.Sub) > 0 {
640 sub := re.Sub[0]
641 if sub.Op == OpEmptyMatch {
642 return nil
643 }
644 return sub
645 }
646 return re
647 }
648
649
650
651
652 func (p *parser) removeLeadingRegexp(re *Regexp, reuse bool) *Regexp {
653 if re.Op == OpConcat && len(re.Sub) > 0 {
654 if reuse {
655 p.reuse(re.Sub[0])
656 }
657 re.Sub = re.Sub[:copy(re.Sub, re.Sub[1:])]
658 switch len(re.Sub) {
659 case 0:
660 re.Op = OpEmptyMatch
661 re.Sub = nil
662 case 1:
663 old := re
664 re = re.Sub[0]
665 p.reuse(old)
666 }
667 return re
668 }
669 if reuse {
670 p.reuse(re)
671 }
672 return p.newRegexp(OpEmptyMatch)
673 }
674
675 func literalRegexp(s string, flags Flags) *Regexp {
676 re := &Regexp{Op: OpLiteral}
677 re.Flags = flags
678 re.Rune = re.Rune0[:0]
679 for _, c := range s {
680 if len(re.Rune) >= cap(re.Rune) {
681
682 re.Rune = []rune(s)
683 break
684 }
685 re.Rune = append(re.Rune, c)
686 }
687 return re
688 }
689
690
691
692
693
694
695 func Parse(s string, flags Flags) (*Regexp, error) {
696 if flags&Literal != 0 {
697
698 if err := checkUTF8(s); err != nil {
699 return nil, err
700 }
701 return literalRegexp(s, flags), nil
702 }
703
704
705 var (
706 p parser
707 err error
708 c rune
709 op Op
710 lastRepeat string
711 )
712 p.flags = flags
713 p.wholeRegexp = s
714 t := s
715 for t != "" {
716 repeat := ""
717 BigSwitch:
718 switch t[0] {
719 default:
720 if c, t, err = nextRune(t); err != nil {
721 return nil, err
722 }
723 p.literal(c)
724
725 case '(':
726 if p.flags&PerlX != 0 && len(t) >= 2 && t[1] == '?' {
727
728 if t, err = p.parsePerlFlags(t); err != nil {
729 return nil, err
730 }
731 break
732 }
733 p.numCap++
734 p.op(opLeftParen).Cap = p.numCap
735 t = t[1:]
736 case '|':
737 if err = p.parseVerticalBar(); err != nil {
738 return nil, err
739 }
740 t = t[1:]
741 case ')':
742 if err = p.parseRightParen(); err != nil {
743 return nil, err
744 }
745 t = t[1:]
746 case '^':
747 if p.flags&OneLine != 0 {
748 p.op(OpBeginText)
749 } else {
750 p.op(OpBeginLine)
751 }
752 t = t[1:]
753 case '$':
754 if p.flags&OneLine != 0 {
755 p.op(OpEndText).Flags |= WasDollar
756 } else {
757 p.op(OpEndLine)
758 }
759 t = t[1:]
760 case '.':
761 if p.flags&DotNL != 0 {
762 p.op(OpAnyChar)
763 } else {
764 p.op(OpAnyCharNotNL)
765 }
766 t = t[1:]
767 case '[':
768 if t, err = p.parseClass(t); err != nil {
769 return nil, err
770 }
771 case '*', '+', '?':
772 before := t
773 switch t[0] {
774 case '*':
775 op = OpStar
776 case '+':
777 op = OpPlus
778 case '?':
779 op = OpQuest
780 }
781 after := t[1:]
782 if after, err = p.repeat(op, 0, 0, before, after, lastRepeat); err != nil {
783 return nil, err
784 }
785 repeat = before
786 t = after
787 case '{':
788 op = OpRepeat
789 before := t
790 min, max, after, ok := p.parseRepeat(t)
791 if !ok {
792
793 p.literal('{')
794 t = t[1:]
795 break
796 }
797 if min < 0 || min > 1000 || max > 1000 || max >= 0 && min > max {
798
799 return nil, &Error{ErrInvalidRepeatSize, before[:len(before)-len(after)]}
800 }
801 if after, err = p.repeat(op, min, max, before, after, lastRepeat); err != nil {
802 return nil, err
803 }
804 repeat = before
805 t = after
806 case '\\':
807 if p.flags&PerlX != 0 && len(t) >= 2 {
808 switch t[1] {
809 case 'A':
810 p.op(OpBeginText)
811 t = t[2:]
812 break BigSwitch
813 case 'b':
814 p.op(OpWordBoundary)
815 t = t[2:]
816 break BigSwitch
817 case 'B':
818 p.op(OpNoWordBoundary)
819 t = t[2:]
820 break BigSwitch
821 case 'C':
822
823 return nil, &Error{ErrInvalidEscape, t[:2]}
824 case 'Q':
825
826 var lit string
827 if i := strings.Index(t, `\E`); i < 0 {
828 lit = t[2:]
829 t = ""
830 } else {
831 lit = t[2:i]
832 t = t[i+2:]
833 }
834 for lit != "" {
835 c, rest, err := nextRune(lit)
836 if err != nil {
837 return nil, err
838 }
839 p.literal(c)
840 lit = rest
841 }
842 break BigSwitch
843 case 'z':
844 p.op(OpEndText)
845 t = t[2:]
846 break BigSwitch
847 }
848 }
849
850 re := p.newRegexp(OpCharClass)
851 re.Flags = p.flags
852
853
854 if len(t) >= 2 && (t[1] == 'p' || t[1] == 'P') {
855 r, rest, err := p.parseUnicodeClass(t, re.Rune0[:0])
856 if err != nil {
857 return nil, err
858 }
859 if r != nil {
860 re.Rune = r
861 t = rest
862 p.push(re)
863 break BigSwitch
864 }
865 }
866
867
868 if r, rest := p.parsePerlClassEscape(t, re.Rune0[:0]); r != nil {
869 re.Rune = r
870 t = rest
871 p.push(re)
872 break BigSwitch
873 }
874 p.reuse(re)
875
876
877 if c, t, err = p.parseEscape(t); err != nil {
878 return nil, err
879 }
880 p.literal(c)
881 }
882 lastRepeat = repeat
883 }
884
885 p.concat()
886 if p.swapVerticalBar() {
887
888 p.stack = p.stack[:len(p.stack)-1]
889 }
890 p.alternate()
891
892 n := len(p.stack)
893 if n != 1 {
894 return nil, &Error{ErrMissingParen, s}
895 }
896 return p.stack[0], nil
897 }
898
899
900
901
902 func (p *parser) parseRepeat(s string) (min, max int, rest string, ok bool) {
903 if s == "" || s[0] != '{' {
904 return
905 }
906 s = s[1:]
907 var ok1 bool
908 if min, s, ok1 = p.parseInt(s); !ok1 {
909 return
910 }
911 if s == "" {
912 return
913 }
914 if s[0] != ',' {
915 max = min
916 } else {
917 s = s[1:]
918 if s == "" {
919 return
920 }
921 if s[0] == '}' {
922 max = -1
923 } else if max, s, ok1 = p.parseInt(s); !ok1 {
924 return
925 } else if max < 0 {
926
927 min = -1
928 }
929 }
930 if s == "" || s[0] != '}' {
931 return
932 }
933 rest = s[1:]
934 ok = true
935 return
936 }
937
938
939
940
941 func (p *parser) parsePerlFlags(s string) (rest string, err error) {
942 t := s
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959 if len(t) > 4 && t[2] == 'P' && t[3] == '<' {
960
961 end := strings.IndexRune(t, '>')
962 if end < 0 {
963 if err = checkUTF8(t); err != nil {
964 return "", err
965 }
966 return "", &Error{ErrInvalidNamedCapture, s}
967 }
968
969 capture := t[:end+1]
970 name := t[4:end]
971 if err = checkUTF8(name); err != nil {
972 return "", err
973 }
974 if !isValidCaptureName(name) {
975 return "", &Error{ErrInvalidNamedCapture, capture}
976 }
977
978
979 p.numCap++
980 re := p.op(opLeftParen)
981 re.Cap = p.numCap
982 re.Name = name
983 return t[end+1:], nil
984 }
985
986
987 var c rune
988 t = t[2:]
989 flags := p.flags
990 sign := +1
991 sawFlag := false
992 Loop:
993 for t != "" {
994 if c, t, err = nextRune(t); err != nil {
995 return "", err
996 }
997 switch c {
998 default:
999 break Loop
1000
1001
1002 case 'i':
1003 flags |= FoldCase
1004 sawFlag = true
1005 case 'm':
1006 flags &^= OneLine
1007 sawFlag = true
1008 case 's':
1009 flags |= DotNL
1010 sawFlag = true
1011 case 'U':
1012 flags |= NonGreedy
1013 sawFlag = true
1014
1015
1016 case '-':
1017 if sign < 0 {
1018 break Loop
1019 }
1020 sign = -1
1021
1022
1023 flags = ^flags
1024 sawFlag = false
1025
1026
1027 case ':', ')':
1028 if sign < 0 {
1029 if !sawFlag {
1030 break Loop
1031 }
1032 flags = ^flags
1033 }
1034 if c == ':' {
1035
1036 p.op(opLeftParen)
1037 }
1038 p.flags = flags
1039 return t, nil
1040 }
1041 }
1042
1043 return "", &Error{ErrInvalidPerlOp, s[:len(s)-len(t)]}
1044 }
1045
1046
1047
1048
1049
1050
1051 func isValidCaptureName(name string) bool {
1052 if name == "" {
1053 return false
1054 }
1055 for _, c := range name {
1056 if c != '_' && !isalnum(c) {
1057 return false
1058 }
1059 }
1060 return true
1061 }
1062
1063
1064 func (p *parser) parseInt(s string) (n int, rest string, ok bool) {
1065 if s == "" || s[0] < '0' || '9' < s[0] {
1066 return
1067 }
1068
1069 if len(s) >= 2 && s[0] == '0' && '0' <= s[1] && s[1] <= '9' {
1070 return
1071 }
1072 t := s
1073 for s != "" && '0' <= s[0] && s[0] <= '9' {
1074 s = s[1:]
1075 }
1076 rest = s
1077 ok = true
1078
1079 t = t[:len(t)-len(s)]
1080 for i := 0; i < len(t); i++ {
1081
1082 if n >= 1e8 {
1083 n = -1
1084 break
1085 }
1086 n = n*10 + int(t[i]) - '0'
1087 }
1088 return
1089 }
1090
1091
1092
1093 func isCharClass(re *Regexp) bool {
1094 return re.Op == OpLiteral && len(re.Rune) == 1 ||
1095 re.Op == OpCharClass ||
1096 re.Op == OpAnyCharNotNL ||
1097 re.Op == OpAnyChar
1098 }
1099
1100
1101 func matchRune(re *Regexp, r rune) bool {
1102 switch re.Op {
1103 case OpLiteral:
1104 return len(re.Rune) == 1 && re.Rune[0] == r
1105 case OpCharClass:
1106 for i := 0; i < len(re.Rune); i += 2 {
1107 if re.Rune[i] <= r && r <= re.Rune[i+1] {
1108 return true
1109 }
1110 }
1111 return false
1112 case OpAnyCharNotNL:
1113 return r != '\n'
1114 case OpAnyChar:
1115 return true
1116 }
1117 return false
1118 }
1119
1120
1121 func (p *parser) parseVerticalBar() error {
1122 p.concat()
1123
1124
1125
1126
1127
1128 if !p.swapVerticalBar() {
1129 p.op(opVerticalBar)
1130 }
1131
1132 return nil
1133 }
1134
1135
1136
1137
1138 func mergeCharClass(dst, src *Regexp) {
1139 switch dst.Op {
1140 case OpAnyChar:
1141
1142 case OpAnyCharNotNL:
1143
1144 if matchRune(src, '\n') {
1145 dst.Op = OpAnyChar
1146 }
1147 case OpCharClass:
1148
1149 if src.Op == OpLiteral {
1150 dst.Rune = appendLiteral(dst.Rune, src.Rune[0], src.Flags)
1151 } else {
1152 dst.Rune = appendClass(dst.Rune, src.Rune)
1153 }
1154 case OpLiteral:
1155
1156 if src.Rune[0] == dst.Rune[0] && src.Flags == dst.Flags {
1157 break
1158 }
1159 dst.Op = OpCharClass
1160 dst.Rune = appendLiteral(dst.Rune[:0], dst.Rune[0], dst.Flags)
1161 dst.Rune = appendLiteral(dst.Rune, src.Rune[0], src.Flags)
1162 }
1163 }
1164
1165
1166
1167
1168 func (p *parser) swapVerticalBar() bool {
1169
1170
1171 n := len(p.stack)
1172 if n >= 3 && p.stack[n-2].Op == opVerticalBar && isCharClass(p.stack[n-1]) && isCharClass(p.stack[n-3]) {
1173 re1 := p.stack[n-1]
1174 re3 := p.stack[n-3]
1175
1176 if re1.Op > re3.Op {
1177 re1, re3 = re3, re1
1178 p.stack[n-3] = re3
1179 }
1180 mergeCharClass(re3, re1)
1181 p.reuse(re1)
1182 p.stack = p.stack[:n-1]
1183 return true
1184 }
1185
1186 if n >= 2 {
1187 re1 := p.stack[n-1]
1188 re2 := p.stack[n-2]
1189 if re2.Op == opVerticalBar {
1190 if n >= 3 {
1191
1192
1193 cleanAlt(p.stack[n-3])
1194 }
1195 p.stack[n-2] = re1
1196 p.stack[n-1] = re2
1197 return true
1198 }
1199 }
1200 return false
1201 }
1202
1203
1204 func (p *parser) parseRightParen() error {
1205 p.concat()
1206 if p.swapVerticalBar() {
1207
1208 p.stack = p.stack[:len(p.stack)-1]
1209 }
1210 p.alternate()
1211
1212 n := len(p.stack)
1213 if n < 2 {
1214 return &Error{ErrUnexpectedParen, p.wholeRegexp}
1215 }
1216 re1 := p.stack[n-1]
1217 re2 := p.stack[n-2]
1218 p.stack = p.stack[:n-2]
1219 if re2.Op != opLeftParen {
1220 return &Error{ErrUnexpectedParen, p.wholeRegexp}
1221 }
1222
1223 p.flags = re2.Flags
1224 if re2.Cap == 0 {
1225
1226 p.push(re1)
1227 } else {
1228 re2.Op = OpCapture
1229 re2.Sub = re2.Sub0[:1]
1230 re2.Sub[0] = re1
1231 p.push(re2)
1232 }
1233 return nil
1234 }
1235
1236
1237
1238 func (p *parser) parseEscape(s string) (r rune, rest string, err error) {
1239 t := s[1:]
1240 if t == "" {
1241 return 0, "", &Error{ErrTrailingBackslash, ""}
1242 }
1243 c, t, err := nextRune(t)
1244 if err != nil {
1245 return 0, "", err
1246 }
1247
1248 Switch:
1249 switch c {
1250 default:
1251 if c < utf8.RuneSelf && !isalnum(c) {
1252
1253
1254
1255
1256 return c, t, nil
1257 }
1258
1259
1260 case '1', '2', '3', '4', '5', '6', '7':
1261
1262 if t == "" || t[0] < '0' || t[0] > '7' {
1263 break
1264 }
1265 fallthrough
1266 case '0':
1267
1268 r = c - '0'
1269 for i := 1; i < 3; i++ {
1270 if t == "" || t[0] < '0' || t[0] > '7' {
1271 break
1272 }
1273 r = r*8 + rune(t[0]) - '0'
1274 t = t[1:]
1275 }
1276 return r, t, nil
1277
1278
1279 case 'x':
1280 if t == "" {
1281 break
1282 }
1283 if c, t, err = nextRune(t); err != nil {
1284 return 0, "", err
1285 }
1286 if c == '{' {
1287
1288
1289
1290
1291 nhex := 0
1292 r = 0
1293 for {
1294 if t == "" {
1295 break Switch
1296 }
1297 if c, t, err = nextRune(t); err != nil {
1298 return 0, "", err
1299 }
1300 if c == '}' {
1301 break
1302 }
1303 v := unhex(c)
1304 if v < 0 {
1305 break Switch
1306 }
1307 r = r*16 + v
1308 if r > unicode.MaxRune {
1309 break Switch
1310 }
1311 nhex++
1312 }
1313 if nhex == 0 {
1314 break Switch
1315 }
1316 return r, t, nil
1317 }
1318
1319
1320 x := unhex(c)
1321 if c, t, err = nextRune(t); err != nil {
1322 return 0, "", err
1323 }
1324 y := unhex(c)
1325 if x < 0 || y < 0 {
1326 break
1327 }
1328 return x*16 + y, t, nil
1329
1330
1331
1332
1333
1334
1335
1336 case 'a':
1337 return '\a', t, err
1338 case 'f':
1339 return '\f', t, err
1340 case 'n':
1341 return '\n', t, err
1342 case 'r':
1343 return '\r', t, err
1344 case 't':
1345 return '\t', t, err
1346 case 'v':
1347 return '\v', t, err
1348 }
1349 return 0, "", &Error{ErrInvalidEscape, s[:len(s)-len(t)]}
1350 }
1351
1352
1353
1354 func (p *parser) parseClassChar(s, wholeClass string) (r rune, rest string, err error) {
1355 if s == "" {
1356 return 0, "", &Error{Code: ErrMissingBracket, Expr: wholeClass}
1357 }
1358
1359
1360
1361 if s[0] == '\\' {
1362 return p.parseEscape(s)
1363 }
1364
1365 return nextRune(s)
1366 }
1367
1368 type charGroup struct {
1369 sign int
1370 class []rune
1371 }
1372
1373
1374
1375
1376 func (p *parser) parsePerlClassEscape(s string, r []rune) (out []rune, rest string) {
1377 if p.flags&PerlX == 0 || len(s) < 2 || s[0] != '\\' {
1378 return
1379 }
1380 g := perlGroup[s[0:2]]
1381 if g.sign == 0 {
1382 return
1383 }
1384 return p.appendGroup(r, g), s[2:]
1385 }
1386
1387
1388
1389
1390 func (p *parser) parseNamedClass(s string, r []rune) (out []rune, rest string, err error) {
1391 if len(s) < 2 || s[0] != '[' || s[1] != ':' {
1392 return
1393 }
1394
1395 i := strings.Index(s[2:], ":]")
1396 if i < 0 {
1397 return
1398 }
1399 i += 2
1400 name, s := s[0:i+2], s[i+2:]
1401 g := posixGroup[name]
1402 if g.sign == 0 {
1403 return nil, "", &Error{ErrInvalidCharRange, name}
1404 }
1405 return p.appendGroup(r, g), s, nil
1406 }
1407
1408 func (p *parser) appendGroup(r []rune, g charGroup) []rune {
1409 if p.flags&FoldCase == 0 {
1410 if g.sign < 0 {
1411 r = appendNegatedClass(r, g.class)
1412 } else {
1413 r = appendClass(r, g.class)
1414 }
1415 } else {
1416 tmp := p.tmpClass[:0]
1417 tmp = appendFoldedClass(tmp, g.class)
1418 p.tmpClass = tmp
1419 tmp = cleanClass(&p.tmpClass)
1420 if g.sign < 0 {
1421 r = appendNegatedClass(r, tmp)
1422 } else {
1423 r = appendClass(r, tmp)
1424 }
1425 }
1426 return r
1427 }
1428
1429 var anyTable = &unicode.RangeTable{
1430 R16: []unicode.Range16{{Lo: 0, Hi: 1<<16 - 1, Stride: 1}},
1431 R32: []unicode.Range32{{Lo: 1 << 16, Hi: unicode.MaxRune, Stride: 1}},
1432 }
1433
1434
1435
1436 func unicodeTable(name string) (*unicode.RangeTable, *unicode.RangeTable) {
1437
1438 if name == "Any" {
1439 return anyTable, anyTable
1440 }
1441 if t := unicode.Categories[name]; t != nil {
1442 return t, unicode.FoldCategory[name]
1443 }
1444 if t := unicode.Scripts[name]; t != nil {
1445 return t, unicode.FoldScript[name]
1446 }
1447 return nil, nil
1448 }
1449
1450
1451
1452
1453 func (p *parser) parseUnicodeClass(s string, r []rune) (out []rune, rest string, err error) {
1454 if p.flags&UnicodeGroups == 0 || len(s) < 2 || s[0] != '\\' || s[1] != 'p' && s[1] != 'P' {
1455 return
1456 }
1457
1458
1459 sign := +1
1460 if s[1] == 'P' {
1461 sign = -1
1462 }
1463 t := s[2:]
1464 c, t, err := nextRune(t)
1465 if err != nil {
1466 return
1467 }
1468 var seq, name string
1469 if c != '{' {
1470
1471 seq = s[:len(s)-len(t)]
1472 name = seq[2:]
1473 } else {
1474
1475 end := strings.IndexRune(s, '}')
1476 if end < 0 {
1477 if err = checkUTF8(s); err != nil {
1478 return
1479 }
1480 return nil, "", &Error{ErrInvalidCharRange, s}
1481 }
1482 seq, t = s[:end+1], s[end+1:]
1483 name = s[3:end]
1484 if err = checkUTF8(name); err != nil {
1485 return
1486 }
1487 }
1488
1489
1490 if name != "" && name[0] == '^' {
1491 sign = -sign
1492 name = name[1:]
1493 }
1494
1495 tab, fold := unicodeTable(name)
1496 if tab == nil {
1497 return nil, "", &Error{ErrInvalidCharRange, seq}
1498 }
1499
1500 if p.flags&FoldCase == 0 || fold == nil {
1501 if sign > 0 {
1502 r = appendTable(r, tab)
1503 } else {
1504 r = appendNegatedTable(r, tab)
1505 }
1506 } else {
1507
1508
1509
1510 tmp := p.tmpClass[:0]
1511 tmp = appendTable(tmp, tab)
1512 tmp = appendTable(tmp, fold)
1513 p.tmpClass = tmp
1514 tmp = cleanClass(&p.tmpClass)
1515 if sign > 0 {
1516 r = appendClass(r, tmp)
1517 } else {
1518 r = appendNegatedClass(r, tmp)
1519 }
1520 }
1521 return r, t, nil
1522 }
1523
1524
1525
1526 func (p *parser) parseClass(s string) (rest string, err error) {
1527 t := s[1:]
1528 re := p.newRegexp(OpCharClass)
1529 re.Flags = p.flags
1530 re.Rune = re.Rune0[:0]
1531
1532 sign := +1
1533 if t != "" && t[0] == '^' {
1534 sign = -1
1535 t = t[1:]
1536
1537
1538
1539 if p.flags&ClassNL == 0 {
1540 re.Rune = append(re.Rune, '\n', '\n')
1541 }
1542 }
1543
1544 class := re.Rune
1545 first := true
1546 for t == "" || t[0] != ']' || first {
1547
1548
1549 if t != "" && t[0] == '-' && p.flags&PerlX == 0 && !first && (len(t) == 1 || t[1] != ']') {
1550 _, size := utf8.DecodeRuneInString(t[1:])
1551 return "", &Error{Code: ErrInvalidCharRange, Expr: t[:1+size]}
1552 }
1553 first = false
1554
1555
1556 if len(t) > 2 && t[0] == '[' && t[1] == ':' {
1557 nclass, nt, err := p.parseNamedClass(t, class)
1558 if err != nil {
1559 return "", err
1560 }
1561 if nclass != nil {
1562 class, t = nclass, nt
1563 continue
1564 }
1565 }
1566
1567
1568 nclass, nt, err := p.parseUnicodeClass(t, class)
1569 if err != nil {
1570 return "", err
1571 }
1572 if nclass != nil {
1573 class, t = nclass, nt
1574 continue
1575 }
1576
1577
1578 if nclass, nt := p.parsePerlClassEscape(t, class); nclass != nil {
1579 class, t = nclass, nt
1580 continue
1581 }
1582
1583
1584 rng := t
1585 var lo, hi rune
1586 if lo, t, err = p.parseClassChar(t, s); err != nil {
1587 return "", err
1588 }
1589 hi = lo
1590
1591 if len(t) >= 2 && t[0] == '-' && t[1] != ']' {
1592 t = t[1:]
1593 if hi, t, err = p.parseClassChar(t, s); err != nil {
1594 return "", err
1595 }
1596 if hi < lo {
1597 rng = rng[:len(rng)-len(t)]
1598 return "", &Error{Code: ErrInvalidCharRange, Expr: rng}
1599 }
1600 }
1601 if p.flags&FoldCase == 0 {
1602 class = appendRange(class, lo, hi)
1603 } else {
1604 class = appendFoldedRange(class, lo, hi)
1605 }
1606 }
1607 t = t[1:]
1608
1609
1610 re.Rune = class
1611 class = cleanClass(&re.Rune)
1612 if sign < 0 {
1613 class = negateClass(class)
1614 }
1615 re.Rune = class
1616 p.push(re)
1617 return t, nil
1618 }
1619
1620
1621
1622 func cleanClass(rp *[]rune) []rune {
1623
1624
1625 sort.Sort(ranges{rp})
1626
1627 r := *rp
1628 if len(r) < 2 {
1629 return r
1630 }
1631
1632
1633 w := 2
1634 for i := 2; i < len(r); i += 2 {
1635 lo, hi := r[i], r[i+1]
1636 if lo <= r[w-1]+1 {
1637
1638 if hi > r[w-1] {
1639 r[w-1] = hi
1640 }
1641 continue
1642 }
1643
1644 r[w] = lo
1645 r[w+1] = hi
1646 w += 2
1647 }
1648
1649 return r[:w]
1650 }
1651
1652
1653 func appendLiteral(r []rune, x rune, flags Flags) []rune {
1654 if flags&FoldCase != 0 {
1655 return appendFoldedRange(r, x, x)
1656 }
1657 return appendRange(r, x, x)
1658 }
1659
1660
1661 func appendRange(r []rune, lo, hi rune) []rune {
1662
1663
1664
1665
1666 n := len(r)
1667 for i := 2; i <= 4; i += 2 {
1668 if n >= i {
1669 rlo, rhi := r[n-i], r[n-i+1]
1670 if lo <= rhi+1 && rlo <= hi+1 {
1671 if lo < rlo {
1672 r[n-i] = lo
1673 }
1674 if hi > rhi {
1675 r[n-i+1] = hi
1676 }
1677 return r
1678 }
1679 }
1680 }
1681
1682 return append(r, lo, hi)
1683 }
1684
1685 const (
1686
1687
1688 minFold = 0x0041
1689 maxFold = 0x1e943
1690 )
1691
1692
1693
1694 func appendFoldedRange(r []rune, lo, hi rune) []rune {
1695
1696 if lo <= minFold && hi >= maxFold {
1697
1698 return appendRange(r, lo, hi)
1699 }
1700 if hi < minFold || lo > maxFold {
1701
1702 return appendRange(r, lo, hi)
1703 }
1704 if lo < minFold {
1705
1706 r = appendRange(r, lo, minFold-1)
1707 lo = minFold
1708 }
1709 if hi > maxFold {
1710
1711 r = appendRange(r, maxFold+1, hi)
1712 hi = maxFold
1713 }
1714
1715
1716 for c := lo; c <= hi; c++ {
1717 r = appendRange(r, c, c)
1718 f := unicode.SimpleFold(c)
1719 for f != c {
1720 r = appendRange(r, f, f)
1721 f = unicode.SimpleFold(f)
1722 }
1723 }
1724 return r
1725 }
1726
1727
1728
1729 func appendClass(r []rune, x []rune) []rune {
1730 for i := 0; i < len(x); i += 2 {
1731 r = appendRange(r, x[i], x[i+1])
1732 }
1733 return r
1734 }
1735
1736
1737 func appendFoldedClass(r []rune, x []rune) []rune {
1738 for i := 0; i < len(x); i += 2 {
1739 r = appendFoldedRange(r, x[i], x[i+1])
1740 }
1741 return r
1742 }
1743
1744
1745
1746 func appendNegatedClass(r []rune, x []rune) []rune {
1747 nextLo := '\u0000'
1748 for i := 0; i < len(x); i += 2 {
1749 lo, hi := x[i], x[i+1]
1750 if nextLo <= lo-1 {
1751 r = appendRange(r, nextLo, lo-1)
1752 }
1753 nextLo = hi + 1
1754 }
1755 if nextLo <= unicode.MaxRune {
1756 r = appendRange(r, nextLo, unicode.MaxRune)
1757 }
1758 return r
1759 }
1760
1761
1762 func appendTable(r []rune, x *unicode.RangeTable) []rune {
1763 for _, xr := range x.R16 {
1764 lo, hi, stride := rune(xr.Lo), rune(xr.Hi), rune(xr.Stride)
1765 if stride == 1 {
1766 r = appendRange(r, lo, hi)
1767 continue
1768 }
1769 for c := lo; c <= hi; c += stride {
1770 r = appendRange(r, c, c)
1771 }
1772 }
1773 for _, xr := range x.R32 {
1774 lo, hi, stride := rune(xr.Lo), rune(xr.Hi), rune(xr.Stride)
1775 if stride == 1 {
1776 r = appendRange(r, lo, hi)
1777 continue
1778 }
1779 for c := lo; c <= hi; c += stride {
1780 r = appendRange(r, c, c)
1781 }
1782 }
1783 return r
1784 }
1785
1786
1787 func appendNegatedTable(r []rune, x *unicode.RangeTable) []rune {
1788 nextLo := '\u0000'
1789 for _, xr := range x.R16 {
1790 lo, hi, stride := rune(xr.Lo), rune(xr.Hi), rune(xr.Stride)
1791 if stride == 1 {
1792 if nextLo <= lo-1 {
1793 r = appendRange(r, nextLo, lo-1)
1794 }
1795 nextLo = hi + 1
1796 continue
1797 }
1798 for c := lo; c <= hi; c += stride {
1799 if nextLo <= c-1 {
1800 r = appendRange(r, nextLo, c-1)
1801 }
1802 nextLo = c + 1
1803 }
1804 }
1805 for _, xr := range x.R32 {
1806 lo, hi, stride := rune(xr.Lo), rune(xr.Hi), rune(xr.Stride)
1807 if stride == 1 {
1808 if nextLo <= lo-1 {
1809 r = appendRange(r, nextLo, lo-1)
1810 }
1811 nextLo = hi + 1
1812 continue
1813 }
1814 for c := lo; c <= hi; c += stride {
1815 if nextLo <= c-1 {
1816 r = appendRange(r, nextLo, c-1)
1817 }
1818 nextLo = c + 1
1819 }
1820 }
1821 if nextLo <= unicode.MaxRune {
1822 r = appendRange(r, nextLo, unicode.MaxRune)
1823 }
1824 return r
1825 }
1826
1827
1828
1829 func negateClass(r []rune) []rune {
1830 nextLo := '\u0000'
1831 w := 0
1832 for i := 0; i < len(r); i += 2 {
1833 lo, hi := r[i], r[i+1]
1834 if nextLo <= lo-1 {
1835 r[w] = nextLo
1836 r[w+1] = lo - 1
1837 w += 2
1838 }
1839 nextLo = hi + 1
1840 }
1841 r = r[:w]
1842 if nextLo <= unicode.MaxRune {
1843
1844
1845 r = append(r, nextLo, unicode.MaxRune)
1846 }
1847 return r
1848 }
1849
1850
1851
1852
1853
1854 type ranges struct {
1855 p *[]rune
1856 }
1857
1858 func (ra ranges) Less(i, j int) bool {
1859 p := *ra.p
1860 i *= 2
1861 j *= 2
1862 return p[i] < p[j] || p[i] == p[j] && p[i+1] > p[j+1]
1863 }
1864
1865 func (ra ranges) Len() int {
1866 return len(*ra.p) / 2
1867 }
1868
1869 func (ra ranges) Swap(i, j int) {
1870 p := *ra.p
1871 i *= 2
1872 j *= 2
1873 p[i], p[i+1], p[j], p[j+1] = p[j], p[j+1], p[i], p[i+1]
1874 }
1875
1876 func checkUTF8(s string) error {
1877 for s != "" {
1878 rune, size := utf8.DecodeRuneInString(s)
1879 if rune == utf8.RuneError && size == 1 {
1880 return &Error{Code: ErrInvalidUTF8, Expr: s}
1881 }
1882 s = s[size:]
1883 }
1884 return nil
1885 }
1886
1887 func nextRune(s string) (c rune, t string, err error) {
1888 c, size := utf8.DecodeRuneInString(s)
1889 if c == utf8.RuneError && size == 1 {
1890 return 0, "", &Error{Code: ErrInvalidUTF8, Expr: s}
1891 }
1892 return c, s[size:], nil
1893 }
1894
1895 func isalnum(c rune) bool {
1896 return '0' <= c && c <= '9' || 'A' <= c && c <= 'Z' || 'a' <= c && c <= 'z'
1897 }
1898
1899 func unhex(c rune) rune {
1900 if '0' <= c && c <= '9' {
1901 return c - '0'
1902 }
1903 if 'a' <= c && c <= 'f' {
1904 return c - 'a' + 10
1905 }
1906 if 'A' <= c && c <= 'F' {
1907 return c - 'A' + 10
1908 }
1909 return -1
1910 }
1911
View as plain text