-
Notifications
You must be signed in to change notification settings - Fork 6
/
reg-expr.h
192 lines (181 loc) · 7.08 KB
/
reg-expr.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
/*
* regex - Regular expression pattern matching
* and replacement
*
* By: Ozan S. Yigit (oz), Dept. of Computer Science, York University
* Mods: Craig Durland
*
* This version modified by Harlan Sexton to check for insufficient space
* in the buffer supplied to regexp_comp() for a pattern.
*
* These routines are the PUBLIC DOMAIN equivalents of regex routines as
* found in 4.nBSD UN*X, with minor extensions.
*
* These routines are derived from various implementations found in software
* tools books, and Conroy's grep. They are NOT derived from
* licensed/restricted software. For more interesting/academic/complicated
* implementations, see Henry Spencer's regexp routines, or GNU Emacs
* pattern matching module.
*
* dfa = deterministic finite automata
* Routines:
* regexp_comp: compile a regular expression into a DFA.
* char *regexp_comp(s)
* char *s;
* returns: NULL if OK, else error string
* If s is NULL or 0 length, last compiled pattern is used.
* regexp_exec: execute the DFA to match a pattern.
* int regexp_exec(s)
* char *s;
* regexp_subs: substitute the matched portions in a new string.
* int regexp_subs(src, dst)
* char *src;
* char *dst;
* regexp_fail: failure routine for regexp_exec.
* void regexp_fail(msg, op)
* char *msg;
* char op;
*
* Regular Expressions:
*
* [1] char matches itself, unless it is a special
* character (metachar): . \ [ ] * + ^ $
*
* [2] . matches any character.
*
* [3] \ matches the character following it, except
* when followed by one of: ()123456789<> adnwW
* (see [7], [8] and [9])
* It is used as an escape character for all other
* meta-characters, and itself. When used in a set
* ([4]), it is treated as an ordinary character.
*
* [4] [set] matches one of the characters in the set.
* If the first character in the set is "^",
* it matches a character NOT in the set. A
* shorthand S-E is used to specify a set of
* characters S upto E, inclusive. The special
* characters "]" and "-" have no special
* meaning if they appear as the first chars
* in the set.
* examples: match:
*
* [a-z] any lowercase alpha
*
* [^]-] any char except ] and -
*
* [^A-Z] any char except uppercase
* alpha
*
* [a-zA-Z] any alpha
*
* [5] * any regular expression form [1] to [4], followed by
* closure char (*) matches zero or more matches of
* that form.
*
* [6] + same as [5], except it matches one or more.
*
* [7] a regular expression in the form [1] to [10], enclosed
* as \(form\) matches what form matches. The enclosure
* creates a set of tags, used for [8] and for
* pattern substution. The tagged forms are numbered
* starting from 1.
*
* [8] a \ followed by a digit 1 to 9 matches whatever a
* previously tagged regular expression ([7]) matched.
*
* [9] \< a regular expression starting with a \< construct
* \> and/or ending with a \> construct, restricts the
* pattern matching to the beginning of a word, and/or
* the end of a word. A word is defined to be a character
* string beginning and/or ending with the characters
* A-Z a-z 0-9 and _. It must also be preceded and/or
* followed by any character outside those mentioned.
*
* [10] a composite regular expression xy where x and y
* are in the form [1] to [10] matches the longest
* match of x followed by a match for y.
*
* [11] ^ a regular expression starting with a ^ character
* $ and/or ending with a $ character, restricts the
* pattern matching to the beginning of the line,
* or the end of line. [anchors] Elsewhere in the
* pattern, ^ and $ are treated as ordinary characters.
*
* Acknowledgements:
* HCR's Hugh Redelmeier has been most helpful in various stages of
* development. He convinced me to include BOW and EOW constructs,
* originally invented by Rob Pike at the University of Toronto.
* References:
* Software tools Kernighan & Plauger
* Software tools in Pascal Kernighan & Plauger
* Grep [rsx-11 C dist] David Conroy
* ed - text editor Un*x Programmer's Manual
* Advanced editing on Un*x B. W. Kernighan
* RegExp routines Henry Spencer
* Notes:
* This implementation uses a bit-set representation for character sets for
* speed and compactness. Each character is represented by one bit in a
* 128-bit block. Thus, SET or NSET always takes a constant 16 bytes in
* the internal dfa, and regexp_exec does a single bit comparison to locate
* the character in the set.
* Put CLO in front of what gets closed for ease of interpreting.
* Put END at end of what gets closed to limit recursion.
* Examples:
* pattern: foo*.*
* compile: CHR f CHR o CLO CHR o END CLO ANY END END
* matches: fo foo fooo foobar fobar foxx ...
*
* pattern: fo[ob]a[rz]
* compile: CHR f CHR o SET bitset CHR a SET bitset END
* matches: fobar fooar fobaz fooaz
*
* pattern: foo\\+
* compile: CHR f CHR o CHR o CHR \ CLO CHR \ END END
* matches: foo\ foo\\ foo\\\ ...
*
* pattern: \(foo\)[1-3]\1 (same as foo[1-3]foo)
* compile: BOT 1 CHR f CHR o CHR o EOT 1 SET bitset REF 1 END
* matches: foo1foo foo2foo foo3foo
*
* pattern: \(fo.*\)-\1
* compile: BOT 1 CHR f CHR o CLO ANY END EOT 1 CHR - REF 1 END
* matches: foo-foo fo-fo fob-fob foobar-foobar ...
*/
/* $Id: reg-expr.h,v 1.2 1993/05/07 06:59:27 hbs Exp $ */
#ifndef REG_EXPR_DOT_H
#define REG_EXPR_DOT_H
#ifdef __cplusplus
extern "C" {
#endif
/* regexp_comp(): compile a regular expression into a DFA.
returns: NULL if OK, else error string
If s is NULL or 0 length, last compiled pattern is used.
*/
unsigned char*
regexp_comp (unsigned char* s, unsigned char* automaton, int bufsize);
/* regexp_exec: execute the DFA to match a pattern.
*/
int
regexp_exec (unsigned char* s, int SoL, int move, unsigned char* automaton);
/* regexp_subs: substitute the matched portions in a new string.
*/
int
regexp_subs (unsigned char *src, unsigned char *dst);
/* regexp_fail: failure routine for regexp_exec.
*/
void
regexp_fail (unsigned char *msg, unsigned char op);
/* beginning of pattern and end of pattern contanin the start and end position
of the the matched pieces.
bopat [0], eopat [0] is the total match
bopat [1-9] .. is the part matched by \1-9
*/
extern unsigned char *regext_bopat[], *regexp_eopat[];
/* error string returned if the buffer supplied to regexp_comp() is
too short for the compiled pattern */
extern unsigned char *regexp_dfa_buffer_too_short;
#ifdef __cplusplus
}
#endif
#endif /* REG_EXPR_DOT_H */