From 61a563ec4484df103182afd0b5f49e1f91d3454d Mon Sep 17 00:00:00 2001 From: Adam Dickmeiss Date: Thu, 27 Jun 2013 16:40:50 +0200 Subject: [PATCH] HTMLParser more forgiving with bad attributes The idea is not to fix, but at least passthru as text. --- src/html_parser.cpp | 80 ++++++++++++++++++++++++---------------------- src/test_html_parser.cpp | 60 ++++++++++++++++++---------------- 2 files changed, 75 insertions(+), 65 deletions(-) diff --git a/src/html_parser.cpp b/src/html_parser.cpp index ddbbbe0..e704620 100644 --- a/src/html_parser.cpp +++ b/src/html_parser.cpp @@ -102,38 +102,39 @@ int mp::HTMLParser::Rep::skipAttribute(HTMLParserEvent &event, const char **value, int *val_len, int *tr) { + int v0, v1; int i = skipName(cp); *attr_len = i; *value = NULL; if (!i) return skipSpace(cp); i += skipSpace(cp + i); - if (cp[i] == '=') + if (cp[i] != '=') + return 0; + + i++; + i += skipSpace(cp + i); + if (cp[i] == '\"' || cp[i] == '\'') { - int v0, v1; - i++; - i += skipSpace(cp + i); - if (cp[i] == '\"' || cp[i] == '\'') - { - *tr = cp[i]; - v0 = ++i; - while (cp[i] != *tr && cp[i]) - i++; - v1 = i; - if (cp[i]) - i++; - } - else - { - *tr = 0; - v0 = i; - while (cp[i] && !strchr(SPACECHR ">", cp[i])) - i++; - v1 = i; - } - *value = cp + v0; - *val_len = v1 - v0; + *tr = cp[i]; + v0 = ++i; + while (cp[i] != *tr && cp[i]) + i++; + v1 = i; + if (cp[i]) + i++; + } + else + { + *tr = 0; + v0 = i; + while (cp[i] && !strchr(SPACECHR ">", cp[i])) + i++; + v1 = i; } + *value = cp + v0; + *val_len = v1 - v0; + i += skipSpace(cp + i); return i; } @@ -150,22 +151,18 @@ int mp::HTMLParser::Rep::tagAttrs(HTMLParserEvent &event, const char *value; int val_len; int tr; + char x[2]; int nor = skipAttribute(event, cp+i, &attr_len, &value, &val_len, &tr); + if (!nor) + break; i += nor; - if (nor) - { - char x[2]; - x[0] = tr; - x[1] = 0; - if (m_verbose) - printf ("------ attr %.*s=%.*s\n", attr_len, attr_name, - val_len, value); - event.attribute(name, len, attr_name, attr_len, value, val_len, x); - } - else - { - i++; - } + + x[0] = tr; + x[1] = 0; + if (m_verbose) + printf ("------ attr %.*s=%.*s\n", attr_len, attr_name, + val_len, value); + event.attribute(name, len, attr_name, attr_len, value, val_len, x); } return i; } @@ -222,7 +219,11 @@ int mp::HTMLParser::Rep::tagEnd(HTMLParserEvent &event, for (; cp[i] && cp[i] != '/' && cp[i] != '>'; i++) ; if (i > 0) + { + if (m_verbose) + printf("------ text %.*s\n", i, cp); event.text(cp, i); + } if (cp[i] == '/') { close_it = 1; @@ -230,6 +231,9 @@ int mp::HTMLParser::Rep::tagEnd(HTMLParserEvent &event, } if (cp[i] == '>') { + if (m_verbose) + printf("------ any tag %s %.*s\n", + close_it ? " close" : "end", tag_len, tag); event.anyTagEnd(tag, tag_len, close_it); i++; } diff --git a/src/test_html_parser.cpp b/src/test_html_parser.cpp index 6f5c134..baf42bc 100644 --- a/src/test_html_parser.cpp +++ b/src/test_html_parser.cpp @@ -80,14 +80,17 @@ BOOST_AUTO_TEST_CASE( test_html_parser_1 ) "some text" "
"; MyEvent e; - hp.set_verbose(1); + hp.set_verbose(0); hp.parse(e, html); - std::cout << "Expected" << std::endl; - std::cout << expected << std::endl; - std::cout << "Got" << std::endl; - std::cout << e.out << std::endl; BOOST_CHECK_EQUAL(std::string(expected), e.out); + if (std::string(expected) != e.out) + { + std::cout << "Expected" << std::endl; + std::cout << expected << std::endl; + std::cout << "Got" << std::endl; + std::cout << e.out << std::endl; + } } catch (std::exception & e) { @@ -117,15 +120,17 @@ BOOST_AUTO_TEST_CASE( test_html_parser_2 ) const char* expected = html; MyEvent e; - hp.set_verbose(1); + hp.set_verbose(0); hp.parse(e, html); - std::cout << "Expected" << std::endl; - std::cout << expected << std::endl; - std::cout << "Got" << std::endl; - std::cout << e.out << std::endl; - BOOST_CHECK_EQUAL(std::string(expected), e.out); + if (std::string(expected) != e.out) + { + std::cout << "Expected" << std::endl; + std::cout << expected << std::endl; + std::cout << "Got" << std::endl; + std::cout << e.out << std::endl; + } } catch (std::exception & e) { @@ -152,15 +157,17 @@ BOOST_AUTO_TEST_CASE( test_html_parser_3 ) const char* expected = html; MyEvent e; - hp.set_verbose(1); + hp.set_verbose(0); hp.parse(e, html); - std::cout << "Expected" << std::endl; - std::cout << expected << std::endl; - std::cout << "Got" << std::endl; - std::cout << e.out << std::endl; - BOOST_CHECK_EQUAL(std::string(expected), e.out); + if (std::string(expected) != e.out) + { + std::cout << "Expected" << std::endl; + std::cout << expected << std::endl; + std::cout << "Got" << std::endl; + std::cout << e.out << std::endl; + } } catch (std::exception & e) { @@ -170,28 +177,28 @@ BOOST_AUTO_TEST_CASE( test_html_parser_3 ) } } -#if 0 -// null ptr exception BOOST_AUTO_TEST_CASE( test_html_parser_4 ) { try { mp::HTMLParser hp; const char* html = - "<\"?xml version=\"1.0\" strandalone=\"no\"?>\n" - ""; + "<\"?xml version=\"1.0\" strandalone=\"no\"? ax>\n" + ""; // does not work const char* expected = html; MyEvent e; hp.set_verbose(1); hp.parse(e, html); - std::cout << "Expected" << std::endl; - std::cout << expected << std::endl; - std::cout << "Got" << std::endl; - std::cout << e.out << std::endl; - BOOST_CHECK_EQUAL(std::string(expected), e.out); + if (std::string(expected) != e.out) + { + std::cout << "Expected" << std::endl; + std::cout << expected << std::endl; + std::cout << "Got" << std::endl; + std::cout << e.out << std::endl; + } } catch (std::exception & e) { @@ -200,7 +207,6 @@ BOOST_AUTO_TEST_CASE( test_html_parser_4 ) BOOST_CHECK (false); } } -#endif /* * Local variables: -- 1.7.10.4