|
Home > Archive > Unix Programming > January 2004 > regex
You are viewing an archived Text-only version of the thread.
To view this thread in its original format and/or if you want to reply to
this thread please [click here]
|
|
| Greg Martin 2004-01-23, 5:35 pm |
| RFC 2396 on URI's gives the regular expression stored in the string regex
in the code below. The expression compiles successfully but produces no
matches. I'm afraid I know little about regular expressions. I'd appreciate
any help - an explanation for why there is no match here, the corrected
expression or resources for developing a good understanding of regex and
regular expressions.
Thanks.
//
// gcc -Wall regextest.cpp -o regextest -lstdc++ -lm
//
#include <sys/types.h>
#include <regex.h>
#include <iostream>
#include <string>
// Returns the message regerror() produces for `code`, using a buffer of
// exactly the size the library asks for.
static std::string regex_error_text(int code, const regex_t &re)
{
    // Called with errbuf_size == 0, regerror() ignores the buffer pointer and
    // returns the number of bytes (terminating NUL included) the message needs.
    const size_t needed = regerror(code, &re, 0, 0);
    if(needed == 0)
        return std::string();
    std::string text(needed, '\0');
    regerror(code, &re, &text[0], needed);
    text.resize(needed - 1); // drop the NUL that regerror() wrote
    return text;
}

// Compiles the RFC 2396 appendix B URI expression, runs it against a sample
// URI and prints the start/end offsets of every reported (sub)match.
// Returns 0 on a successful match, 1 if compiling or matching fails.
int main()
{
    // NOTE: this is the program as originally asked about. A flags argument
    // of 0 makes regcomp() use POSIX *basic* syntax, and "\?" inside a C++
    // literal is just the character '?', so the pattern compiles but cannot
    // match the URI. The working form passes REG_EXTENDED and writes "\\?".
    std::string regex =
        "^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?";
    std::string uri = "http://www.ics.uci.edu/pub/ietf/uri/#Related";

    regex_t preg;
    int rv = regcomp(&preg, regex.c_str(), 0);
    if(rv != 0)
    {
        // Nothing to regfree(): the compile did not succeed.
        std::cout << regex_error_text(rv, preg) << "\n";
        return 1;
    }
    std::cout << "Compiled successfully\n";

    // Slot 0 is the whole match; the expression has 9 parenthesised groups.
    const size_t nmatch = 10;
    regmatch_t pmatch[nmatch];
    rv = regexec(&preg, uri.c_str(), nmatch, pmatch, 0);
    if(rv != 0)
    {
        std::cout << regex_error_text(rv, preg) << "\n";
        regfree(&preg);
        return 1;
    }
    std::cout << "Executed successfully\n";

    // Groups that did not take part in the match are reported as -1 x -1.
    for(size_t i = 0; i < nmatch; ++i)
    {
        std::cout << i << " : " << pmatch[i].rm_so << " x "
                  << pmatch[i].rm_eo << "\n";
    }
    regfree(&preg);
    return 0;
}
--
Greg Martin
gregmar found at telusplanet on the net
| |
| joe@invalid.address 2004-01-23, 5:35 pm |
| Greg Martin <gregmar@telusplanet.net> writes:
quote:
> RFC 2396 on URI's gives the regular expression stored in the string regex
> in the code below. The expression compiles successfully but produces no
> matches. I'm afraid I know little about regular expressions. I'd appreciate
> any help - an explanation for why there is no match here, the corrected
> expression or resources for developing a good understanding of regex and
> regular expressions.
> Thanks.
> //
> // gcc -Wall regextest.cpp -o regextest -lstdc++ -lm
> //
> #include <sys/types.h>
> #include <regex.h>
>
> #include <iostream>
> #include <string>
>
> int main()
> {
> std::string regex =
> "^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?";
This should work with perl, but in C/C++ you need to quote the
backslash. Try it with
"^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\\?([^#]*))?(#(.*))?";
Joe
| |
| Mike Chirico 2004-01-23, 5:35 pm |
|
"Greg Martin" <gregmar@telusplanet.net> wrote in message
news:AOyMb.12158$De.11896@edtnps84...quote:
> RFC 2396 on URI's gives the regular expression stored in the string regex
> in the code below. The expression compiles successfully but produces no
> matches. I'm afraid I know little about regular expressions. I'd appreciate
> any help - an explanation for why there is no match here, the corrected
> expression or resources for developing a good understanding of regex and
> regular expressions.
> Thanks.
> //
> // gcc -Wall regextest.cpp -o regextest -lstdc++ -lm
> //
> #include <sys/types.h>
> #include <regex.h>
>
> #include <iostream>
> #include <string>
>
> int main()
> {
> std::string regex =
> "^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?";
> std::string uri = "http://www.ics.uci.edu/pub/ietf/uri/#Related";
>
> regex_t preg;
[ snip]
Here's a C sample, that may help:
#include <stdio.h>
#include <sys/types.h>
#include <string.h>
#include <regex.h>
/*******************************************************
gcc -o regexp regexp.c
./regexp '([a|c|e|g]{2}|[h-z])([0-9]|-)(a|b)' cc8b
match
4 0
$0 = cc8b, preg.re_nsub = 3
2 0
$1 = cc, preg.re_nsub = 3
3 2
$2 = 8, preg.re_nsub = 3
4 3
$3 = b, preg.re_nsub = 3
A html copy of this program can be found at
http://souptonuts.sourceforge.net/code/C_regexp.c.html
****************************************
***************/
#define NUM_MATCHES 4//max sub-matches
/*
 * Usage: ./regexp <extended-regex> <subject>
 *
 * Compiles argv[1] as a POSIX extended regular expression, matches it against
 * argv[2], prints "match" or "No match", and on a match prints the offsets
 * and text of the whole match and of each sub-expression (at most
 * NUM_MATCHES entries in total).
 *
 * Returns 0 after printing the usage line or after running the match,
 * and 1 if the pattern does not compile.
 */
int
main (int argc, char *argv[])
{
  regex_t preg;
  regmatch_t pmatch[NUM_MATCHES];
  size_t i, reported;
  int rc;

  /* Both the pattern (argv[1]) and the subject (argv[2]) are required. */
  if (argc < 3)
    {
      printf ("Usage ./regexp [a-z]{2}[0-9]{2} a2b3ed23\n");
      return 0;
    }

  /* Do not call regexec() on a pattern that failed to compile. */
  rc = regcomp (&preg, argv[1], REG_EXTENDED);
  if (rc != 0)
    {
      char errbuf[256];

      regerror (rc, &preg, errbuf, sizeof errbuf);
      fprintf (stderr, "regcomp: %s\n", errbuf);
      return 1;
    }

  rc = regexec (&preg, argv[2], NUM_MATCHES, pmatch, 0);
  puts (rc ? "No match" : "match");

  /* regexec() filled at most NUM_MATCHES slots, however many groups the
     pattern has, so never read beyond that. */
  reported = preg.re_nsub + 1;
  if (reported > (size_t) NUM_MATCHES)
    reported = (size_t) NUM_MATCHES;

  for (i = 0; rc == 0 && i < reported; i++)
    {
      const regoff_t so = pmatch[i].rm_so;
      const regoff_t eo = pmatch[i].rm_eo;
      /* A group that took no part in the match has rm_so == -1; print it
         as an empty string instead of computing a bogus length. */
      const int len = (so < 0) ? 0 : (int) (eo - so);
      const char *text = (so < 0) ? "" : argv[2] + so;

      printf ("%ld %ld\n", (long) eo, (long) so);
      /* %.*s prints straight from the subject, so no fixed-size copy
         buffer can overflow. */
      printf ("$%lu = %.*s, preg.re_nsub = %lu\n",
              (unsigned long) i, len, text, (unsigned long) preg.re_nsub);
    }

  regfree (&preg);
  return 0;
}
Regards,
Mike Chirico
| |
| Greg Martin 2004-01-23, 5:35 pm |
| joe@invalid.address wrote:
quote:
> Greg Martin <gregmar@telusplanet.net> writes:
>
>
> This should work with perl, but in C/C++ you need to quote the
> backslash. Try it with
>
> "^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\\?([^#]*))?(#(.*))?";
I tried that but it still doesn't produce a match for me. I tried with
REG_EXTENDED as well - still no match.
--
Greg Martin
gregmar found at telusplanet on the net
| |
| joe@invalid.address 2004-01-23, 5:35 pm |
| Greg Martin <gregmar@telusplanet.net> writes:
quote:
> joe@invalid.address wrote:
>
>
> I tried that but it still doesn't produce a match for me. I tried with
> REG_EXTENDED as well - still no match.
Odd, I tried it with the Sun Workshop 6.x CC and it worked fine for
me. It doesn't work for me with gcc or g++ (why are you compiling a
C++ program with gcc rather than g++?).
I've got gcc version:
$ g++ --version
g++ (GCC) 3.2.2 20030222 (Red Hat Linux 3.2.2-5)
Copyright (C) 2002 Free Software Foundation, Inc.
This is free software; see the source for copying conditions. There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
It may be a glibc problem. The RE looks ok to me.
Joe
| |
| John W. Krahn 2004-01-23, 5:35 pm |
| Greg Martin wrote:quote:
>
> RFC 2396 on URI's gives the regular expression stored in the string regex
> in the code below. The expression compiles successfully but produces no
> matches. I'm afraid I know little about regular expressions. I'd appreciate
> any help - an explanation for why there is no match here, the corrected
> expression
Well it works in PERL but I don't know enough about C++ to see what is wrong.
$ perl -le'
my $preg = qr"^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?";
my $uri = "http://www.ics.uci.edu/pub/ietf/uri/#Related";
my @matches = $uri =~ $preg;
print for @matches;
'
http:
http
//www.ics.uci.edu
www.ics.uci.edu
/pub/ietf/uri/
#Related
Related
quote:
> or resources for developing a good understanding of regex and
> regular expressions.
For a good understanding of regular expressions get the book "Mastering Regular Expressions".
http://www.oreilly.com/catalog/regex2/index.html
John
--
use Perl;
program
fulfillment
| |
| Valentin Nechayev 2004-01-23, 5:35 pm |
| >>> Greg Martin wrote:
[QUOTE][color=darkred]
Yes, this is important: as it is string source used by C compiler,
\\ is translated to single backslash in resulting code.
GM> I tried that but it still doesn't produce a match for me. I tried with
GM> REG_EXTENDED as well - still no match.
You should try REG_EXTENDED from the very beginning. This regexp is
extended POSIX regexp, not basic. Also, PERL regexps are extensions
of extended POSIX regexps, not basic POSIX regexps.
I tried to run your program on FreeBSD 4.9 (regexp routines from libc).
It runs successfully with change of single backslash to double in C string
source. I can't imagine what another problem you met. Show regerror() output.
-netch-
| |
| Greg Martin 2004-01-23, 5:35 pm |
| joe@invalid.address wrote:
quote:
> Greg Martin <gregmar@telusplanet.net> writes:
>
>
> Odd, I tried it with the Sun Workshop 6.x CC and it worked fine for
> me. It doesn't work for me with gcc or g++ (why are you compiling a
> C++ program with gcc rather than g++?).
>
Just habit (gcc runs the C++ compiler any way).
der if ther
--
Greg Martin
gregmar found at telusplanet on the net
| |
| Greg Martin 2004-01-23, 5:35 pm |
| Valentin Nechayev wrote:
quote:
>
>
> Yes, this is important: as it is string source used by C compiler,
> \\ is translated to single backslash in resulting code.
>
> GM> I tried that but it still doesn't produce a match for me. I tried with
> GM> REG_EXTENDED as well - still no match.
>
> You should try REG_EXTENDED from the very beginning. This regexp is
> extended POSIX regexp, not basic. Also, PERL regexps are extensions
> of extended POSIX regexps, not basic POSIX regexps.
>
> I tried to run your program on FreeBSD 4.9 (regexp routines from libc).
> It runs successfully with change of single backslash to double in C string
> source. I can't imagine what another problem you met. Show regerror()
> output.
>
>
"No match" is all it returns. I used Debian. I'll try it on some other
platforms.
--
Greg Martin
gregmar found at telusplanet on the net
| |
| Espen Myrland 2004-01-23, 5:35 pm |
| Greg Martin <gregmar@telusplanet.net> writes:
quote:
>
> "No match" is all it returns. I used Debian. I'll try it on some other
> platforms.
>
With both the above additions, it works here on Debian, gcc 2.95.4,
libc-2.3.2
--
/espen
| |
| joe@invalid.address 2004-01-23, 5:35 pm |
| Greg Martin <gregmar@telusplanet.net> writes:
quote:
> Valentin Nechayev wrote:
>
>
> "No match" is all it returns. I used Debian. I'll try it on some other
> platforms.
It works using REG_EXTENDED with g++ 3.2.2 on RedHat 9 (although I
doubt the distribution changes the compiler).
Joe
| |
| Greg Martin 2004-01-23, 5:35 pm |
| Espen Myrland wrote:
quote:
> Greg Martin <gregmar@telusplanet.net> writes:
>
>
>
>
> With both the above additions, it works here on Debian, gcc 2.95.4,
> libc-2.3.2
>
>
With REG_EXTENDED and escaping the back slash I get a match in the first
position for an offset of 4 x 5 but no matches for the rest. According to
the RFC there should be matches for seven of the positions, I believe.
Compiled successfully
Executed successfully
0 : 4 x 5
1 : -1 x -1
2 : -1 x -1
3 : -1 x -1
4 : -1 x -1
5 : -1 x -1
6 : -1 x -1
7 : -1 x -1
8 : -1 x -1
#include <sys/types.h>
#include <regex.h>
#include <iostream>
#include <string>
// Returns the message regerror() produces for `code`, using a buffer of
// exactly the size the library asks for.
static std::string regex_error_text(int code, const regex_t &re)
{
    // Called with errbuf_size == 0, regerror() ignores the buffer pointer and
    // returns the number of bytes (terminating NUL included) the message needs.
    const size_t needed = regerror(code, &re, 0, 0);
    if(needed == 0)
        return std::string();
    std::string text(needed, '\0');
    regerror(code, &re, &text[0], needed);
    text.resize(needed - 1); // drop the NUL that regerror() wrote
    return text;
}

// Compiles the RFC 2396 appendix B URI expression as a POSIX extended regex,
// runs it against a sample URI and prints the start/end offsets of the whole
// match and of each of the nine sub-expressions.
// Returns 0 on a successful match, 1 if compiling or matching fails.
int main()
{
    // "\\?" yields the two characters \? in the pattern, i.e. a literal '?'.
    std::string regex =
        "^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\\?([^#]*))?(#(.*))?";
    std::string uri = "http://www.ics.uci.edu/pub/ietf/uri/#Related";

    regex_t preg;
    int rv = regcomp(&preg, regex.c_str(), REG_EXTENDED);
    if(rv != 0)
    {
        // Nothing to regfree(): the compile did not succeed.
        std::cout << regex_error_text(rv, preg) << "\n";
        return 1;
    }
    std::cout << "Compiled successfully\n";

    // Slot 0 is the whole match; the expression has 9 parenthesised groups,
    // so 10 slots are needed for group 9 (the fragment) to be reported.
    const size_t nmatch = 10;
    regmatch_t pmatch[nmatch];
    rv = regexec(&preg, uri.c_str(), nmatch, pmatch, 0);
    if(rv != 0)
    {
        std::cout << regex_error_text(rv, preg) << "\n";
        regfree(&preg);
        return 1;
    }
    std::cout << "Executed successfully\n";

    // Groups that did not take part in the match are reported as -1 x -1.
    for(size_t i = 0; i < nmatch; ++i)
    {
        std::cout << i << " : " << pmatch[i].rm_so << " x "
                  << pmatch[i].rm_eo << "\n";
    }
    regfree(&preg);
    return 0;
}
--
Greg Martin
gregmar found at telusplanet on the net
| |
| Espen Myrland 2004-01-23, 5:36 pm |
| Greg Martin <gregmar@telusplanet.net> writes:
quote:
> Espen Myrland wrote:
>
> With REG_EXTENDED and escaping the back slash I get a match in the first
> position for an offset of 4 x 5 but no matches for the rest. According to
> the RFC there should be matches for seven of the positions, I believe.
>
> Compiled successfully
> Executed successfully
> 0 : 4 x 5
> 1 : -1 x -1
> 2 : -1 x -1
> 3 : -1 x -1
> 4 : -1 x -1
> 5 : -1 x -1
> 6 : -1 x -1
> 7 : -1 x -1
> 8 : -1 x -1
>
This is strange. I get seven matches and they look
reasonable:
0 : 0 x 44
1 : 0 x 5
2 : 0 x 4
3 : 5 x 22
4 : 7 x 22
5 : 22 x 36
6 : -1 x -1
7 : -1 x -1
8 : 36 x 44
--
espen
|
|
|
|
|