> maybe in applications where minor parsing/text validation tasks are more peripheral, regexes are more appropriate?
Definitely. There's a 3-line regex-based tokenizer in one of my side projects (in Python). Today, for kicks, I tried writing the same function without regexes using two different approaches: one uses list comprehensions and makes liberal use of Python's string, set, and list methods; the other is a single-pass ad hoc state machine.
The regex function is 3 lines and parses 1,000,000 characters in 0.3 seconds on my desktop.
The split-strip function is 20 lines, imo very hard to read, and far slower than the regex (1.77 seconds on the same benchmark).
The ad hoc state machine is 89 lines and easily the slowest (5.3 seconds on the million-character benchmark).
I'm pretty sure that even if I moved this app toward production, I'd polish the regex function rather than swap it out for one of the other approaches.
import re

# Matches "(offset, mode): notes" sections, e.g. "(+2, Dm): 1 2 3".
key_sections_regex = re.compile(r'\(([\-+]?[0-9]+,\s*\S+)\):([\s\-0-9]+)', re.S)

def split_keyed_line(keyedline):
    return [[k, n.strip()] for k, n in re.findall(key_sections_regex, keyedline)]
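For context, the input is a sequence of `(offset, mode): notes` sections. A made-up line in that shape (my real data isn't shown here) tokenizes like this:

>>> split_keyed_line('(0, C): 1 2 3 (+2, Dm): 4 5')
[['0, C', '1 2 3'], ['+2, Dm', '4 5']]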
def split_keyed_line_splits(keyedline):
    whitespace = [' ', '\t', '\n', '\r', '\f', '\v']
    notechars = set(list('0123456789-') + whitespace)
    offsetchars = set('0123456789-+')

    def splitka(ka):
        # Validate the "offset, mode" key area; reject anything with a
        # bad offset character or embedded whitespace in the mode.
        off, mode = ka.split(',')
        if set(off).issubset(offsetchars):
            if len(set(mode.strip()).intersection(whitespace)) == 0:
                return ','.join([off, mode])
        raise Exception

    def validnotes(notestr):
        # Validate the ":notes" area and strip the leading colon.
        if notestr.startswith(':'):
            if set(notestr[1:]).issubset(notechars):
                return notestr[1:].strip()
            else:
                raise Exception(notestr)

    return [[splitka(keyarea), validnotes(notearea.strip())]
            for keyarea, notearea in
            [x.split(')') for x in keyedline.split('(') if x != '']]
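Same made-up input, same output:

>>> split_keyed_line_splits('(0, C): 1 2 3 (+2, Dm): 4 5')
[['0, C', '1 2 3'], ['+2, Dm', '4 5']]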
def split_keyed_line_adhoc(keyedline):
    whitespace = [' ', '\t', '\n', '\r', '\f', '\v']
    digits = list('0123456789')
    keyarea_begin = '('
    keyarea_end = ')'
    keyarea_split = ','
    keyarea_offset_signs = ['-', '+']
    keyarea_offset_values = digits
    notearea_values = digits + whitespace + ['-']
    tokens = []
    # Listed for reference; the machine only ever reads `state`.
    states = ['start', 'offsetsign', 'offset', 'premode',
              'mode', 'transition', 'notes', 'accept']
    state = 'start'
    current_token = ''
    subtoken = []
    for ch in keyedline:
        if state == 'start':
            # Skip leading whitespace until the first '('.
            if ch in whitespace:
                continue
            if ch == keyarea_begin:
                state = 'offsetsign'
                continue
            print("Syntax Error")
            raise Exception
        elif state == 'offsetsign':
            # Optional sign, or the first digit of the offset.
            if ch in keyarea_offset_signs:
                current_token += ch
            elif ch in keyarea_offset_values:
                current_token += ch
            else:
                print("Syntax Error")
                raise Exception
            state = 'offset'
            continue
        elif state == 'offset':
            if ch in keyarea_offset_values:
                current_token += ch
            elif ch == keyarea_split:
                current_token += ch
                state = 'premode'
            # Any other character is silently ignored in this state.
            continue
        elif state == 'premode':
            # Skip whitespace after the comma, so ' C' is stored as 'C'.
            if ch in whitespace:
                continue
            current_token += ch
            state = 'mode'
            continue
        elif state == 'mode':
            if ch == keyarea_end:
                subtoken.append(current_token)
                current_token = ''
                state = 'transition'
            elif ch in whitespace:
                raise Exception
            else:
                current_token += ch
            continue
        elif state == 'transition':
            # A ':' must separate the key area from the notes.
            if ch == ':':
                state = 'notes'
                continue
            else:
                raise Exception
        elif state == 'notes':
            if ch in notearea_values:
                current_token += ch
                continue
            elif ch == keyarea_begin:
                # A new key area starts; finish the current token pair.
                subtoken.append(current_token.strip())
                tokens.append(subtoken)
                current_token = ''
                subtoken = []
                state = 'offsetsign'
                continue
    # End of input: a line ending mid-notes (or an empty line) is accepted.
    if state == 'notes':
        subtoken.append(current_token.strip())
        tokens.append(subtoken)
        current_token = ''
        subtoken = []
        state = 'accept'
    elif state == 'start':
        state = 'accept'
    if state == 'accept':
        return tokens
    else:
        return ('error', state, tokens)
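The benchmark itself was nothing fancy, roughly this shape; make_line here is a hypothetical stand-in for my actual input (the timings above came from my data, not this generator):

import timeit

def make_line(n):
    # Hypothetical input: repeat one well-formed section until the
    # line is roughly n characters long.
    section = '(+2, Dm): 1 2 3 -4 5 '
    return section * (n // len(section))

line = make_line(1000000)
for fn in (split_keyed_line, split_keyed_line_splits, split_keyed_line_adhoc):
    print(fn.__name__, timeit.timeit(lambda: fn(line), number=1))

(One caveat: the state machine drops the whitespace after the comma in the key area, so its output differs cosmetically from the other two.)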