You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
689 lines
16 KiB
689 lines
16 KiB
/*
XXX support range parameter on search
XXX support mstop parameter on search
*/
|
|
|
|
|
/* Regular expression objects */

/* This uses Tatu Ylonen's copyleft-free reimplementation of
   GNU regular expressions */
|
|
|
#include "Python.h" |
|
|
|
#include <ctype.h> |
|
|
|
#include "regexpr.h" |
|
|
|
static PyObject *RegexError; /* Exception */ |
|
|
|
typedef struct { |
|
PyObject_HEAD |
|
struct re_pattern_buffer re_patbuf; /* The compiled expression */ |
|
struct re_registers re_regs; /* The registers from the last match */ |
|
char re_fastmap[256]; /* Storage for fastmap */ |
|
PyObject *re_translate; /* String object for translate table */ |
|
PyObject *re_lastok; /* String object last matched/searched */ |
|
PyObject *re_groupindex; /* Group name to index dictionary */ |
|
PyObject *re_givenpat; /* Pattern with symbolic groups */ |
|
PyObject *re_realpat; /* Pattern without symbolic groups */ |
|
} regexobject; |
|
|
|
/* Regex object methods */ |
|
|
|
static void |
|
reg_dealloc(regexobject *re) |
|
{ |
|
if (re->re_patbuf.buffer) |
|
free(re->re_patbuf.buffer); |
|
Py_XDECREF(re->re_translate); |
|
Py_XDECREF(re->re_lastok); |
|
Py_XDECREF(re->re_groupindex); |
|
Py_XDECREF(re->re_givenpat); |
|
Py_XDECREF(re->re_realpat); |
|
PyObject_Del(re); |
|
} |
|
|
|
static PyObject * |
|
makeresult(struct re_registers *regs) |
|
{ |
|
PyObject *v; |
|
int i; |
|
static PyObject *filler = NULL; |
|
|
|
if (filler == NULL) { |
|
filler = Py_BuildValue("(ii)", -1, -1); |
|
if (filler == NULL) |
|
return NULL; |
|
} |
|
v = PyTuple_New(RE_NREGS); |
|
if (v == NULL) |
|
return NULL; |
|
|
|
for (i = 0; i < RE_NREGS; i++) { |
|
int lo = regs->start[i]; |
|
int hi = regs->end[i]; |
|
PyObject *w; |
|
if (lo == -1 && hi == -1) { |
|
w = filler; |
|
Py_INCREF(w); |
|
} |
|
else |
|
w = Py_BuildValue("(ii)", lo, hi); |
|
if (w == NULL || PyTuple_SetItem(v, i, w) < 0) { |
|
Py_DECREF(v); |
|
return NULL; |
|
} |
|
} |
|
return v; |
|
} |
|
|
|
static PyObject * |
|
regobj_match(regexobject *re, PyObject *args) |
|
{ |
|
PyObject *argstring; |
|
char *buffer; |
|
int size; |
|
int offset = 0; |
|
int result; |
|
|
|
if (!PyArg_ParseTuple(args, "O|i:match", &argstring, &offset)) |
|
return NULL; |
|
if (!PyArg_Parse(argstring, "t#", &buffer, &size)) |
|
return NULL; |
|
|
|
if (offset < 0 || offset > size) { |
|
PyErr_SetString(RegexError, "match offset out of range"); |
|
return NULL; |
|
} |
|
Py_XDECREF(re->re_lastok); |
|
re->re_lastok = NULL; |
|
result = _Py_re_match(&re->re_patbuf, (unsigned char *)buffer, size, offset, |
|
&re->re_regs); |
|
if (result < -1) { |
|
/* Serious failure of some sort; if re_match didn't |
|
set an exception, raise a generic error */ |
|
if (!PyErr_Occurred()) |
|
PyErr_SetString(RegexError, "match failure"); |
|
return NULL; |
|
} |
|
if (result >= 0) { |
|
Py_INCREF(argstring); |
|
re->re_lastok = argstring; |
|
} |
|
return PyInt_FromLong((long)result); /* Length of the match or -1 */ |
|
} |
|
|
|
static PyObject * |
|
regobj_search(regexobject *re, PyObject *args) |
|
{ |
|
PyObject *argstring; |
|
char *buffer; |
|
int size; |
|
int offset = 0; |
|
int range; |
|
int result; |
|
|
|
if (!PyArg_ParseTuple(args, "O|i:search", &argstring, &offset)) |
|
return NULL; |
|
if (!PyArg_Parse(argstring, "t#:search", &buffer, &size)) |
|
return NULL; |
|
|
|
if (offset < 0 || offset > size) { |
|
PyErr_SetString(RegexError, "search offset out of range"); |
|
return NULL; |
|
} |
|
/* NB: In Emacs 18.57, the documentation for re_search[_2] and |
|
the implementation don't match: the documentation states that |
|
|range| positions are tried, while the code tries |range|+1 |
|
positions. It seems more productive to believe the code! */ |
|
range = size - offset; |
|
Py_XDECREF(re->re_lastok); |
|
re->re_lastok = NULL; |
|
result = _Py_re_search(&re->re_patbuf, (unsigned char *)buffer, size, offset, range, |
|
&re->re_regs); |
|
if (result < -1) { |
|
/* Serious failure of some sort; if re_match didn't |
|
set an exception, raise a generic error */ |
|
if (!PyErr_Occurred()) |
|
PyErr_SetString(RegexError, "match failure"); |
|
return NULL; |
|
} |
|
if (result >= 0) { |
|
Py_INCREF(argstring); |
|
re->re_lastok = argstring; |
|
} |
|
return PyInt_FromLong((long)result); /* Position of the match or -1 */ |
|
} |
|
|
|
/* get the group from the regex where index can be a string (group name) or |
|
an integer index [0 .. 99] |
|
*/ |
|
static PyObject* |
|
group_from_index(regexobject *re, PyObject *index) |
|
{ |
|
int i, a, b; |
|
char *v; |
|
|
|
if (PyString_Check(index)) |
|
if (re->re_groupindex == NULL || |
|
!(index = PyDict_GetItem(re->re_groupindex, index))) |
|
{ |
|
PyErr_SetString(RegexError, |
|
"group() group name doesn't exist"); |
|
return NULL; |
|
} |
|
|
|
i = PyInt_AsLong(index); |
|
if (i == -1 && PyErr_Occurred()) |
|
return NULL; |
|
|
|
if (i < 0 || i >= RE_NREGS) { |
|
PyErr_SetString(RegexError, "group() index out of range"); |
|
return NULL; |
|
} |
|
if (re->re_lastok == NULL) { |
|
PyErr_SetString(RegexError, |
|
"group() only valid after successful match/search"); |
|
return NULL; |
|
} |
|
a = re->re_regs.start[i]; |
|
b = re->re_regs.end[i]; |
|
if (a < 0 || b < 0) { |
|
Py_INCREF(Py_None); |
|
return Py_None; |
|
} |
|
|
|
if (!(v = PyString_AsString(re->re_lastok))) |
|
return NULL; |
|
|
|
return PyString_FromStringAndSize(v+a, b-a); |
|
} |
|
|
|
|
|
static PyObject * |
|
regobj_group(regexobject *re, PyObject *args) |
|
{ |
|
int n = PyTuple_Size(args); |
|
int i; |
|
PyObject *res = NULL; |
|
|
|
if (n < 0) |
|
return NULL; |
|
if (n == 0) { |
|
PyErr_SetString(PyExc_TypeError, "not enough arguments"); |
|
return NULL; |
|
} |
|
if (n == 1) { |
|
/* return value is a single string */ |
|
PyObject *index = PyTuple_GetItem(args, 0); |
|
if (!index) |
|
return NULL; |
|
|
|
return group_from_index(re, index); |
|
} |
|
|
|
/* return value is a tuple */ |
|
if (!(res = PyTuple_New(n))) |
|
return NULL; |
|
|
|
for (i = 0; i < n; i++) { |
|
PyObject *index = PyTuple_GetItem(args, i); |
|
PyObject *group = NULL; |
|
|
|
if (!index) |
|
goto finally; |
|
if (!(group = group_from_index(re, index))) |
|
goto finally; |
|
if (PyTuple_SetItem(res, i, group) < 0) |
|
goto finally; |
|
} |
|
return res; |
|
|
|
finally: |
|
Py_DECREF(res); |
|
return NULL; |
|
} |
|
|
|
|
|
static struct PyMethodDef reg_methods[] = { |
|
{"match", (PyCFunction)regobj_match, METH_VARARGS}, |
|
{"search", (PyCFunction)regobj_search, METH_VARARGS}, |
|
{"group", (PyCFunction)regobj_group, METH_VARARGS}, |
|
{NULL, NULL} /* sentinel */ |
|
}; |
|
|
|
|
|
|
|
static char* members[] = { |
|
"last", "regs", "translate", |
|
"groupindex", "realpat", "givenpat", |
|
NULL |
|
}; |
|
|
|
|
|
static PyObject * |
|
regobj_getattr(regexobject *re, char *name) |
|
{ |
|
if (strcmp(name, "regs") == 0) { |
|
if (re->re_lastok == NULL) { |
|
Py_INCREF(Py_None); |
|
return Py_None; |
|
} |
|
return makeresult(&re->re_regs); |
|
} |
|
if (strcmp(name, "last") == 0) { |
|
if (re->re_lastok == NULL) { |
|
Py_INCREF(Py_None); |
|
return Py_None; |
|
} |
|
Py_INCREF(re->re_lastok); |
|
return re->re_lastok; |
|
} |
|
if (strcmp(name, "translate") == 0) { |
|
if (re->re_translate == NULL) { |
|
Py_INCREF(Py_None); |
|
return Py_None; |
|
} |
|
Py_INCREF(re->re_translate); |
|
return re->re_translate; |
|
} |
|
if (strcmp(name, "groupindex") == 0) { |
|
if (re->re_groupindex == NULL) { |
|
Py_INCREF(Py_None); |
|
return Py_None; |
|
} |
|
Py_INCREF(re->re_groupindex); |
|
return re->re_groupindex; |
|
} |
|
if (strcmp(name, "realpat") == 0) { |
|
if (re->re_realpat == NULL) { |
|
Py_INCREF(Py_None); |
|
return Py_None; |
|
} |
|
Py_INCREF(re->re_realpat); |
|
return re->re_realpat; |
|
} |
|
if (strcmp(name, "givenpat") == 0) { |
|
if (re->re_givenpat == NULL) { |
|
Py_INCREF(Py_None); |
|
return Py_None; |
|
} |
|
Py_INCREF(re->re_givenpat); |
|
return re->re_givenpat; |
|
} |
|
if (strcmp(name, "__members__") == 0) { |
|
int i = 0; |
|
PyObject *list = NULL; |
|
|
|
/* okay, so it's unlikely this list will change that often. |
|
still, it's easier to change it in just one place. |
|
*/ |
|
while (members[i]) |
|
i++; |
|
if (!(list = PyList_New(i))) |
|
return NULL; |
|
|
|
i = 0; |
|
while (members[i]) { |
|
PyObject* v = PyString_FromString(members[i]); |
|
if (!v || PyList_SetItem(list, i, v) < 0) { |
|
Py_DECREF(list); |
|
return NULL; |
|
} |
|
i++; |
|
} |
|
return list; |
|
} |
|
return Py_FindMethod(reg_methods, (PyObject *)re, name); |
|
} |
|
|
|
static PyTypeObject Regextype = { |
|
PyObject_HEAD_INIT(NULL) |
|
0, /*ob_size*/ |
|
"regex.regex", /*tp_name*/ |
|
sizeof(regexobject), /*tp_size*/ |
|
0, /*tp_itemsize*/ |
|
/* methods */ |
|
(destructor)reg_dealloc, /*tp_dealloc*/ |
|
0, /*tp_print*/ |
|
(getattrfunc)regobj_getattr, /*tp_getattr*/ |
|
0, /*tp_setattr*/ |
|
0, /*tp_compare*/ |
|
0, /*tp_repr*/ |
|
}; |
|
|
|
/* reference counting invariants: |
|
pattern: borrowed |
|
translate: borrowed |
|
givenpat: borrowed |
|
groupindex: transferred |
|
*/ |
|
static PyObject * |
|
newregexobject(PyObject *pattern, PyObject *translate, PyObject *givenpat, PyObject *groupindex) |
|
{ |
|
regexobject *re; |
|
char *pat; |
|
int size; |
|
|
|
if (!PyArg_Parse(pattern, "t#", &pat, &size)) |
|
return NULL; |
|
|
|
if (translate != NULL && PyString_Size(translate) != 256) { |
|
PyErr_SetString(RegexError, |
|
"translation table must be 256 bytes"); |
|
return NULL; |
|
} |
|
re = PyObject_New(regexobject, &Regextype); |
|
if (re != NULL) { |
|
char *error; |
|
re->re_patbuf.buffer = NULL; |
|
re->re_patbuf.allocated = 0; |
|
re->re_patbuf.fastmap = (unsigned char *)re->re_fastmap; |
|
if (translate) { |
|
re->re_patbuf.translate = (unsigned char *)PyString_AsString(translate); |
|
if (!re->re_patbuf.translate) |
|
goto finally; |
|
Py_INCREF(translate); |
|
} |
|
else |
|
re->re_patbuf.translate = NULL; |
|
re->re_translate = translate; |
|
re->re_lastok = NULL; |
|
re->re_groupindex = groupindex; |
|
Py_INCREF(pattern); |
|
re->re_realpat = pattern; |
|
Py_INCREF(givenpat); |
|
re->re_givenpat = givenpat; |
|
error = _Py_re_compile_pattern((unsigned char *)pat, size, &re->re_patbuf); |
|
if (error != NULL) { |
|
PyErr_SetString(RegexError, error); |
|
goto finally; |
|
} |
|
} |
|
return (PyObject *)re; |
|
finally: |
|
Py_DECREF(re); |
|
return NULL; |
|
} |
|
|
|
/* regex.compile(pattern[, translate]): build a regex object from a
   string pattern and an optional string translate table. */
static PyObject *
regex_compile(PyObject *self, PyObject *args)
{
    PyObject *pattern = NULL;
    PyObject *translate = NULL;

    if (PyArg_ParseTuple(args, "S|S:compile", &pattern, &translate)) {
        /* The given pattern and the compiled pattern are the same
           object here, and there is no group index. */
        return newregexobject(pattern, translate, pattern, NULL);
    }
    return NULL;
}
|
|
|
/* Strip symbolic group names from a pattern.

   Every group opener that is directly followed by "<name>" (name made of
   alphanumerics and '_') has the "<name>" part removed, and
   gdict[name] = group number is recorded.  Whether the opener is "\("
   or "(" depends on RE_NO_BK_PARENS in re_syntax.  Text inside an
   unescaped [...] is copied without looking for groups.

   Returns a new string (or a new reference to `pattern` itself when it
   is empty), or NULL with an exception set.  A group name that does not
   fit in the internal name buffer raises RegexError.
*/
static PyObject *
symcomp(PyObject *pattern, PyObject *gdict)
{
    char *opat, *oend, *o, *n, *nstart;
    int group_count = 0;
    int sz;
    int escaped = 0;
    char name_buf[128];
    PyObject *npattern;
    int require_escape = re_syntax & RE_NO_BK_PARENS ? 0 : 1;

    if (!(opat = PyString_AsString(pattern)))
        return NULL;

    if ((sz = PyString_Size(pattern)) < 0)
        return NULL;

    oend = opat + sz;
    o = opat;

    if (oend == opat) {
        Py_INCREF(pattern);
        return pattern;
    }

    /* The output is never longer than the input: every input character
       is copied at most once. */
    if (!(npattern = PyString_FromStringAndSize((char *)NULL, sz)))
        return NULL;
    if (!(nstart = PyString_AsString(npattern))) {
        Py_DECREF(npattern);
        return NULL;
    }
    n = nstart;

    while (o < oend) {
        if (*o == '(' && escaped == require_escape) {
            char *p;
            size_t len = 0;
            PyObject *group_name;
            PyObject *group_index;

            escaped = 0;
            ++group_count;
            *n++ = *o;
            if (++o >= oend || *o != '<')
                continue;
            /* *o == '<' */
            if (o + 1 < oend && *(o + 1) == '>')
                continue;

            /* Scan the candidate name with a separate cursor so that o
               still points at '<' if this turns out not to be a
               well-formed "<name>". */
            for (p = o + 1; p < oend && *p != '>'; p++) {
                if (!isalnum(Py_CHARMASK(*p)) && *p != '_')
                    break;
                if (len >= sizeof(name_buf) - 1) {
                    PyErr_SetString(RegexError,
                                    "symbolic group name too long");
                    Py_DECREF(npattern);
                    return NULL;
                }
                name_buf[len++] = *p;
            }
            if (p >= oend || *p != '>')
                continue;   /* not a name: copy "<..." literally */

            name_buf[len] = '\0';
            group_name = PyString_FromString(name_buf);
            group_index = PyInt_FromLong(group_count);
            if (group_name == NULL ||
                group_index == NULL ||
                PyDict_SetItem(gdict, group_name, group_index) != 0)
            {
                Py_XDECREF(group_name);
                Py_XDECREF(group_index);
                Py_DECREF(npattern);
                return NULL;
            }
            Py_DECREF(group_name);
            Py_DECREF(group_index);
            o = p + 1;      /* skip the name and eat the '>' */
        }
        else if (*o == '[' && !escaped) {
            *n++ = *o++;
            if (o < oend) {
                /* copy the char following '[' */
                *n++ = *o;
                while (o < oend && *o != ']') {
                    ++o;
                    /* never copy the terminator of an unclosed set */
                    if (o < oend)
                        *n++ = *o;
                }
                if (o < oend)
                    ++o;
            }
        }
        else if (*o == '\\') {
            escaped = 1;
            *n++ = *o;
            ++o;
        }
        else {
            escaped = 0;
            *n++ = *o;
            ++o;
        }
    }

    /* _PyString_Resize() releases npattern and sets it to NULL on failure */
    if (_PyString_Resize(&npattern, n - nstart) < 0)
        return NULL;
    return npattern;
}
|
|
|
/* regex.symcomp(pattern[, translate]): like compile(), but the pattern
   may contain symbolic group names.  The names are stripped by symcomp()
   and recorded in a dictionary that becomes the new object's group
   index.  Returns a new regex object, or NULL with an exception set. */
static PyObject *
regex_symcomp(PyObject *self, PyObject *args)
{
    PyObject *pattern;
    PyObject *tran = NULL;
    PyObject *gdict;
    PyObject *npattern;
    PyObject *retval;

    if (!PyArg_ParseTuple(args, "S|S:symcomp", &pattern, &tran))
        return NULL;

    gdict = PyDict_New();
    if (gdict == NULL)
        return NULL;

    npattern = symcomp(pattern, gdict);
    if (npattern == NULL) {
        /* pattern is borrowed from args and must not be released here;
           only the dictionary created above is ours. */
        Py_DECREF(gdict);
        return NULL;
    }

    /* The gdict reference is transferred to newregexobject(). */
    retval = newregexobject(npattern, tran, pattern, gdict);
    Py_DECREF(npattern);
    return retval;
}
|
|
|
|
|
/* One-entry cache used by the module-level match() and search(). */
static PyObject *cache_pat;     /* pattern object last compiled by update_cache() */
static PyObject *cache_prog;    /* regex object compiled from cache_pat */
|
|
|
static int |
|
update_cache(PyObject *pat) |
|
{ |
|
PyObject *tuple = Py_BuildValue("(O)", pat); |
|
int status = 0; |
|
|
|
if (!tuple) |
|
return -1; |
|
|
|
if (pat != cache_pat) { |
|
Py_XDECREF(cache_pat); |
|
cache_pat = NULL; |
|
Py_XDECREF(cache_prog); |
|
cache_prog = regex_compile((PyObject *)NULL, tuple); |
|
if (cache_prog == NULL) { |
|
status = -1; |
|
goto finally; |
|
} |
|
cache_pat = pat; |
|
Py_INCREF(cache_pat); |
|
} |
|
finally: |
|
Py_DECREF(tuple); |
|
return status; |
|
} |
|
|
|
/* regex.match(pattern, string): compile the pattern through the
   one-entry cache and call the match() method of the cached regex
   object on the string. */
static PyObject *
regex_match(PyObject *self, PyObject *args)
{
    PyObject *pat, *string;
    PyObject *call_args;
    PyObject *result;

    if (!PyArg_ParseTuple(args, "SS:match", &pat, &string))
        return NULL;
    if (update_cache(pat) < 0)
        return NULL;

    call_args = Py_BuildValue("(S)", string);
    if (call_args == NULL)
        return NULL;

    result = regobj_match((regexobject *)cache_prog, call_args);
    Py_DECREF(call_args);
    return result;
}
|
|
|
/* regex.search(pattern, string): compile the pattern through the
   one-entry cache and call the search() method of the cached regex
   object on the string. */
static PyObject *
regex_search(PyObject *self, PyObject *args)
{
    PyObject *pat, *string;
    PyObject *call_args;
    PyObject *result;

    if (!PyArg_ParseTuple(args, "SS:search", &pat, &string))
        return NULL;
    if (update_cache(pat) < 0)
        return NULL;

    call_args = Py_BuildValue("(S)", string);
    if (call_args == NULL)
        return NULL;

    result = regobj_search((regexobject *)cache_prog, call_args);
    Py_DECREF(call_args);
    return result;
}
|
|
|
/* regex.set_syntax(syntax): pass the new syntax flags to
   re_set_syntax(), empty the global pattern cache and return the value
   re_set_syntax() returned (presumably the previous syntax -- confirm
   against regexpr.c). */
static PyObject *
regex_set_syntax(PyObject *self, PyObject *args)
{
    int requested;
    int result;

    if (!PyArg_ParseTuple(args, "i:set_syntax", &requested))
        return NULL;

    result = re_set_syntax(requested);

    /* wipe the global pattern cache */
    Py_XDECREF(cache_pat);
    cache_pat = NULL;
    Py_XDECREF(cache_prog);
    cache_prog = NULL;

    return PyInt_FromLong((long)result);
}
|
|
|
/* regex.get_syntax(): return the current value of re_syntax as an int. */
static PyObject *
regex_get_syntax(PyObject *self)
{
    const long current = (long)re_syntax;

    return PyInt_FromLong(current);
}
|
|
|
|
|
static struct PyMethodDef regex_global_methods[] = { |
|
{"compile", regex_compile, METH_VARARGS}, |
|
{"symcomp", regex_symcomp, METH_VARARGS}, |
|
{"match", regex_match, METH_VARARGS}, |
|
{"search", regex_search, METH_VARARGS}, |
|
{"set_syntax", regex_set_syntax, METH_VARARGS}, |
|
{"get_syntax", (PyCFunction)regex_get_syntax, METH_NOARGS}, |
|
{NULL, NULL} /* sentinel */ |
|
}; |
|
|
|
PyMODINIT_FUNC |
|
initregex(void) |
|
{ |
|
PyObject *m, *d, *v; |
|
int i; |
|
char *s; |
|
|
|
/* Initialize object type */ |
|
Regextype.ob_type = &PyType_Type; |
|
|
|
m = Py_InitModule("regex", regex_global_methods); |
|
d = PyModule_GetDict(m); |
|
|
|
if (PyErr_Warn(PyExc_DeprecationWarning, |
|
"the regex module is deprecated; " |
|
"please use the re module") < 0) |
|
return; |
|
|
|
/* Initialize regex.error exception */ |
|
v = RegexError = PyErr_NewException("regex.error", NULL, NULL); |
|
if (v == NULL || PyDict_SetItemString(d, "error", v) != 0) |
|
goto finally; |
|
|
|
/* Initialize regex.casefold constant */ |
|
if (!(v = PyString_FromStringAndSize((char *)NULL, 256))) |
|
goto finally; |
|
|
|
if (!(s = PyString_AsString(v))) |
|
goto finally; |
|
|
|
for (i = 0; i < 256; i++) { |
|
if (isupper(i)) |
|
s[i] = tolower(i); |
|
else |
|
s[i] = i; |
|
} |
|
if (PyDict_SetItemString(d, "casefold", v) < 0) |
|
goto finally; |
|
Py_DECREF(v); |
|
|
|
if (!PyErr_Occurred()) |
|
return; |
|
finally: |
|
/* Nothing */ ; |
|
}
|
|
|