1 /* This file is part of Metaproxy.
2 Copyright (C) 2005-2013 Index Data
4 Metaproxy is free software; you can redistribute it and/or modify it under
5 the terms of the GNU General Public License as published by the Free
6 Software Foundation; either version 2, or (at your option) any later
9 Metaproxy is distributed in the hope that it will be useful, but WITHOUT ANY
10 WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
20 #include "html_parser.hpp"
28 #define TAG_MAX_LEN 64
30 #define SPACECHR " \t\r\n\f"
35 #include <sys/types.h>
38 namespace mp = metaproxy_1;
// Constructs an HTMLParser.
40 mp::HTMLParser::HTMLParser()
// Destroys an HTMLParser.
44 mp::HTMLParser::~HTMLParser()
// Forward declaration: HTMLParser::parse() calls this file-local function,
// which is defined further down in this file.
48 static void parse_str(mp::HTMLParserEvent & event, const char * str);
// Parses the NUL-terminated HTML text in str and reports what it finds to
// event. All of the work is delegated to the file-local parse_str().
50 void mp::HTMLParser::parse(mp::HTMLParserEvent & event, const char *str) const
52 parse_str(event, str);
55 // Static C-style helper functions follow; it would probably make sense to wrap these in a PIMPL.
// Returns a freshly malloc()ed, NUL-terminated copy of the first len bytes
// of buff.
//
// buff need not be NUL-terminated; exactly len bytes are read from it.
// The caller owns the returned buffer and must release it with free().
//
// Returns NULL when len is negative or when the allocation fails, instead
// of copying through an invalid pointer or with a wrapped-around size.
static char* dupe(const char *buff, int len)
{
    if (len < 0)
        return NULL;

    // Do the +1 in size_t so that len == INT_MAX cannot overflow an int.
    const size_t n = (size_t) len;
    char *value = (char *) malloc(n + 1);
    if (!value)
        return NULL;

    memcpy(value, buff, n);
    value[n] = '\0';
    return value;
}
// Measures the run of whitespace at the start of cp: the loop below advances
// index i while cp[i] is one of the SPACECHR characters.
// Callers in this file add the result to their own offset into the input
// (e.g. "i += skipSpace(cp + i)").
66 static int skipSpace(const char *cp)
// cp[i] is tested first because strchr() would also match the terminating
// NUL of SPACECHR and so run past the end of the input.
69 while (cp[i] && strchr(SPACECHR, cp[i]))
74 static int skipName(const char *cp, char *dst)
78 for (i=0; cp[i] && !strchr(SPACECHR "/>=", cp[i]); i++)
79 if (j < TAG_MAX_LEN-1)
81 dst[j] = tolower(cp[j]);
// Parses a single attribute starting at cp.
//
// The attribute name is copied into name by skipName(), which lowercases it
// and stores at most TAG_MAX_LEN-1 characters. value and val_len are
// out-parameters describing the attribute value; tagAttrs() forwards them to
// event.attribute().
// NOTE(review): value presumably points into cp rather than at a copy, and
// the return value is presumably the number of characters consumed - confirm.
88 static int skipAttribute(const char *cp, char *name, const char **value, int *val_len)
90 int i = skipName(cp, name);
// Whitespace is skipped at two points after the name, before the value is examined.
94 i += skipSpace(cp + i);
99 i += skipSpace(cp + i);
// A value may be opened by either a double or a single quote.
100 if (cp[i] == '\"' || cp[i] == '\'')
// Scan up to the character held in tr (presumably the opening quote), or to
// the end of the string if no such character follows.
104 while (cp[i] != tr && cp[i])
// Scan until whitespace, '>' or the end of the string (presumably the
// unquoted-value case).
113 while (cp[i] && !strchr(SPACECHR ">", cp[i]))
// Skip whitespace that follows the attribute.
120 i += skipSpace(cp + i);
// Walks the attribute list of a tag and reports every attribute to event
// through event.attribute().
//
// Parsing starts at cp after any leading whitespace and continues until the
// end of the string, a '>' or a '/' is reached. tagName is only passed
// through to the event handler.
124 static int tagAttrs(mp::HTMLParserEvent & event,
// Buffer for the attribute name; skipAttribute() fills it via skipName().
128 char attr_name[TAG_MAX_LEN];
// Passed by address to skipAttribute() together with val_len.
129 const char *attr_value;
131 int i = skipSpace(cp);
132 while (cp[i] && cp[i] != '>' && cp[i] != '/')
134 int nor = skipAttribute(cp+i, attr_name, &attr_value, &val_len);
// NOTE(review): dupe() returns malloc()ed memory and the pointer is not kept
// here, so this debug print leaks whenever DEBUG() evaluates its argument.
138 DEBUG(printf ("------ attr %s=%s\n", attr_name, dupe(attr_value, val_len)));
139 event.attribute(tagName, attr_name, attr_value, val_len);
// Reads the tag name at cp into tagName (via skipName(), so it is lowercased
// and limited to TAG_MAX_LEN-1 characters) and notifies event that a tag
// starts here.
//
// Judging by the debug messages, which distinguishes a closing tag, a DTD
// declaration, a processing instruction and an ordinary opening tag.
// NOTE(review): only the close-tag and open-tag cases are seen calling into
// event (closeTag / openTagStart); confirm that DTD and PI are meant to
// produce debug output only.
150 static int tagStart(mp::HTMLParserEvent & event,
151 char *tagName, const char *cp, const char which)
153 int i = skipName(cp, tagName);
157 DEBUG(printf("------ tag close %s\n", tagName));
158 event.closeTag(tagName);
161 DEBUG(printf("------ dtd %s\n", tagName));
164 DEBUG(printf("------ pi %s\n", tagName));
167 DEBUG(printf("------ tag open %s\n", tagName));
168 event.openTagStart(tagName);
// Finishes a tag: scans forward from cp to the closing '>' (or to the end of
// the string if there is none) and then reports the end of the tag to event
// through anyTagEnd().
174 static int tagEnd(mp::HTMLParserEvent & event, const char *tagName, const char *cp)
178 while (cp[i] && cp[i] != '>')
// NOTE(review): close_it presumably marks a self-closing tag such as <br/> - confirm.
186 event.anyTagEnd(tagName, close_it);
// Reports the text in the range [text_start, text_end) to event through
// event.text(); does nothing when the range is empty.
192 static void tagText(mp::HTMLParserEvent & event, const char *text_start, const char *text_end)
194 if (text_end - text_start) //got text to flush
// NOTE(review): dupe() returns malloc()ed memory and the pointer is not kept
// here, so this debug print leaks whenever DEBUG() evaluates its argument.
196 DEBUG(printf("------ text %s\n", dupe(text_start, text_end-text_start)));
// The text is handed over as pointer + length; it is not NUL-terminated at text_end.
197 event.text(text_start, text_end-text_start);
// Core of the parser: scans the NUL-terminated input cp and reports tags,
// attributes and text to event.
//
// text_start / text_end delimit text that has been seen but not yet
// reported. It is flushed through tagText() before a tag is processed and
// once more at the end.
201 static void parse_str(mp::HTMLParserEvent & event, const char *cp)
203 const char *text_start = cp;
204 const char *text_end = cp;
// A '<' is only considered a tag start when another character follows it.
207 if (cp[0] == '<' && cp[1]) //tag?
// cp[1] is known to be non-NUL here, so strchr() cannot match the terminator
// of SPACECHR; a '<' followed by whitespace does not take this branch.
212 if (!strchr(SPACECHR, cp[1])) //valid tag starts
214 tagText(event, text_start, text_end); //flush any text
// Receives the tag name from tagStart() and is passed on to tagAttrs() and tagEnd().
215 char tagName[TAG_MAX_LEN];
// Each helper's return value is added to cp to move the cursor forward.
219 cp += tagStart(event, tagName, cp, which);
221 else if (which == '!' || which == '?') //pi or dtd
224 cp += tagStart(event, tagName, cp, which);
// Otherwise: tag name first, then its attribute list.
228 cp += tagStart(event, tagName, cp, which);
229 cp += tagAttrs(event, tagName, cp);
231 cp += tagEnd(event, tagName, cp);
241 tagText(event, text_start, text_end); //flush any text
247 * c-file-style: "Stroustrup"
248 * indent-tabs-mode: nil
250 * vim: shiftwidth=4 tabstop=8 expandtab