1212
1313
1414#define CHAR_EOF -1
15+ #define CHAR_NULL 0
1516#define CHAR_BANG 33
1617#define CHAR_DOUBLE 34
1718#define CHAR_PERCENT 37
2324#define CHAR_GT 62
2425#define CHAR_QUESTION 63
2526#define CHAR_RIGHTB 93
27+ #define CHAR_TICK 96
2628
2729/* prototypes */
2830
@@ -41,6 +43,7 @@ static int h5_state_before_attribute_name(h5_state_t* hs);
4143static int h5_state_before_attribute_value (h5_state_t * hs );
4244static int h5_state_attribute_value_double_quote (h5_state_t * hs );
4345static int h5_state_attribute_value_single_quote (h5_state_t * hs );
46+ static int h5_state_attribute_value_back_quote (h5_state_t * hs );
4447static int h5_state_attribute_value_no_quote (h5_state_t * hs );
4548static int h5_state_after_attribute_value_quoted_state (h5_state_t * hs );
4649static int h5_state_comment (h5_state_t * hs );
@@ -60,16 +63,28 @@ static int h5_state_doctype(h5_state_t* hs);
6063/**
6164 * public function
6265 */
63- void libinjection_h5_init (h5_state_t * hs , const char * s , size_t len , int flags )
66+ void libinjection_h5_init (h5_state_t * hs , const char * s , size_t len , enum html5_flags flags )
6467{
6568 memset (hs , 0 , sizeof (h5_state_t ));
6669 hs -> s = s ;
6770 hs -> len = len ;
68- hs -> state = h5_state_data ;
69- if (flags == 0 ) {
71+
72+ switch (flags ) {
73+ case DATA_STATE :
7074 hs -> state = h5_state_data ;
71- } else {
72- assert (0 );
75+ break ;
76+ case VALUE_NO_QUOTE :
77+ hs -> state = h5_state_before_attribute_name ;
78+ break ;
79+ case VALUE_SINGLE_QUOTE :
80+ hs -> state = h5_state_attribute_value_single_quote ;
81+ break ;
82+ case VALUE_DOUBLE_QUOTE :
83+ hs -> state = h5_state_attribute_value_double_quote ;
84+ break ;
85+ case VALUE_BACK_QUOTE :
86+ hs -> state = h5_state_attribute_value_back_quote ;
87+ break ;
7388 }
7489}
7590
@@ -85,10 +100,18 @@ int libinjection_h5_next(h5_state_t* hs)
85100/**
86101 * Everything below here is private
87102 *
88- */
103+ */
104+
89105
/**
 * Report whether `ch` is treated as whitespace by the tokenizer.
 *
 * Returns 1 for:
 *   ' '  = space          = 0x20
 *   \t   = horizontal tab = 0x09
 *   \n   = newline        = 0x0A
 *   \v   = vertical tab   = 0x0B
 *   \f   = form feed      = 0x0C
 *   \r   = cr             = 0x0D
 *   NUL                   = 0x00
 * and 0 for every other character.
 *
 * NUL is listed so the result matches a strchr() lookup over
 * " \t\n\v\f\r", which also matches the string's terminator.
 */
static int h5_is_white(char ch)
{
    switch (ch) {
    case '\0':
    case ' ':
    case '\t':
    case '\n':
    case '\v':
    case '\f':
    case '\r':
        return 1;
    default:
        return 0;
    }
}
94117
@@ -97,9 +120,17 @@ static int h5_skip_white(h5_state_t* hs)
97120 char ch ;
98121 while (hs -> pos < hs -> len ) {
99122 ch = hs -> s [hs -> pos ];
100- if (ch == ' ' ) {
123+ switch (ch ) {
124+ case 0x00 : /* IE only */
125+ case 0x20 :
126+ case 0x09 :
127+ case 0x0A :
128+ case 0x0B : /* IE only */
129+ case 0x0C :
130+ case 0x0D : /* IE only */
101131 hs -> pos += 1 ;
102- } else {
132+ break ;
133+ default :
103134 return ch ;
104135 }
105136 }
@@ -149,6 +180,9 @@ static int h5_state_tag_open(h5_state_t* hs)
149180 char ch ;
150181
151182 TRACE ();
183+ if (hs -> pos >= hs -> len ) {
184+ return 0 ;
185+ }
152186 ch = hs -> s [hs -> pos ];
153187 if (ch == CHAR_BANG ) {
154188 hs -> pos += 1 ;
@@ -167,6 +201,9 @@ static int h5_state_tag_open(h5_state_t* hs)
167201 return h5_state_bogus_comment2 (hs );
168202 } else if ((ch >= 'a' && ch <= 'z' ) || (ch >= 'A' && ch <= 'Z' )) {
169203 return h5_state_tag_name (hs );
204+ } else if (ch == CHAR_NULL ) {
205+ /* IE-ism NULL characters are ignored */
206+ return h5_state_tag_name (hs );
170207 } else {
171208 /* user input mistake in configuring state */
172209 if (hs -> pos == 0 ) {
@@ -197,7 +234,9 @@ static int h5_state_end_tag_open(h5_state_t* hs)
197234 } else if ((ch >= 'a' && ch <= 'z' ) || (ch >= 'A' && ch <= 'Z' )) {
198235 return h5_state_tag_name (hs );
199236 }
200- return h5_state_data (hs );
237+
238+ hs -> is_close = 0 ;
239+ return h5_state_bogus_comment (hs );
201240}
202241/*
203242 *
@@ -231,7 +270,12 @@ static int h5_state_tag_name(h5_state_t* hs)
231270 pos = hs -> pos ;
232271 while (pos < hs -> len ) {
233272 ch = hs -> s [pos ];
234- if (h5_is_white (ch )) {
273+ if (ch == 0 ) {
274+ /* special non-standard case */
275+ /* allow nulls in tag name */
276+ /* some old browsers apparently allow and ignore them */
277+ pos += 1 ;
278+ } else if (h5_is_white (ch )) {
235279 hs -> token_start = hs -> s + hs -> pos ;
236280 hs -> token_len = pos - hs -> pos ;
237281 hs -> token_type = TAG_NAME_OPEN ;
@@ -299,7 +343,7 @@ static int h5_state_before_attribute_name(h5_state_t* hs)
299343 default : {
300344 return h5_state_attribute_name (hs );
301345 }
302- }
346+ }
303347}
304348
305349static int h5_state_attribute_name (h5_state_t * hs )
@@ -308,7 +352,7 @@ static int h5_state_attribute_name(h5_state_t* hs)
308352 size_t pos ;
309353
310354 TRACE ();
311- pos = hs -> pos ;
355+ pos = hs -> pos + 1 ;
312356 while (pos < hs -> len ) {
313357 ch = hs -> s [pos ];
314358 if (h5_is_white (ch )) {
@@ -358,21 +402,19 @@ static int h5_state_attribute_name(h5_state_t* hs)
358402static int h5_state_after_attribute_name (h5_state_t * hs )
359403{
360404 int c ;
361- size_t pos ;
362405
363406 TRACE ();
364- pos = hs -> pos ;
365407 c = h5_skip_white (hs );
366408 switch (c ) {
367409 case CHAR_EOF : {
368410 return 0 ;
369411 }
370412 case CHAR_SLASH : {
371- hs -> pos = pos + 1 ;
413+ hs -> pos += 1 ;
372414 return h5_state_self_closing_start_tag (hs );
373415 }
374416 case CHAR_EQUALS : {
375- hs -> pos = pos + 1 ;
417+ hs -> pos += 1 ;
376418 return h5_state_before_attribute_value (hs );
377419 }
378420 case CHAR_GT : {
@@ -403,6 +445,9 @@ static int h5_state_before_attribute_value(h5_state_t* hs)
403445 return h5_state_attribute_value_double_quote (hs );
404446 } else if (c == CHAR_SINGLE ) {
405447 return h5_state_attribute_value_single_quote (hs );
448+ } else if (c == CHAR_TICK ) {
449+ /* NON STANDARD IE */
450+ return h5_state_attribute_value_back_quote (hs );
406451 } else {
407452 return h5_state_attribute_value_no_quote (hs );
408453 }
@@ -415,8 +460,16 @@ static int h5_state_attribute_value_quote(h5_state_t* hs, char qchar)
415460
416461 TRACE ();
417462
418- /* skip quote */
419- hs -> pos += 1 ;
463+ /* Skip the opening quote in the normal case.
464+ * Do not skip anything when pos == 0: that means tokenizing was
465+ * started directly in this (non-data) state, so the first byte is
466+ * already part of the value. Given an input of '><foo
467+ * we want to produce a 0-length attribute value.
468+ */
468+ if (hs -> pos > 0 ) {
469+ hs -> pos += 1 ;
470+ }
471+
472+
420473 idx = (const char * ) memchr (hs -> s + hs -> pos , qchar , hs -> len - hs -> pos );
421474 if (idx == NULL ) {
422475 hs -> token_start = hs -> s + hs -> pos ;
@@ -447,6 +500,13 @@ int h5_state_attribute_value_single_quote(h5_state_t* hs)
447500 return h5_state_attribute_value_quote (hs , CHAR_SINGLE );
448501}
449502
503+ static
504+ int h5_state_attribute_value_back_quote (h5_state_t * hs )
505+ {
506+ TRACE ();
507+ return h5_state_attribute_value_quote (hs , CHAR_TICK );
508+ }
509+
450510static int h5_state_attribute_value_no_quote (h5_state_t * hs )
451511{
452512 char ch ;
@@ -656,10 +716,13 @@ static int h5_state_comment(h5_state_t* hs)
656716 char ch ;
657717 const char * idx ;
658718 size_t pos ;
719+ size_t offset ;
720+ const char * end = hs -> s + hs -> len ;
659721
660722 TRACE ();
661723 pos = hs -> pos ;
662724 while (1 ) {
725+
663726 idx = (const char * ) memchr (hs -> s + pos , CHAR_DASH , hs -> len - pos );
664727
665728 /* did not find anything or has less than 3 chars left */
@@ -670,21 +733,62 @@ static int h5_state_comment(h5_state_t* hs)
670733 hs -> token_type = TAG_COMMENT ;
671734 return 1 ;
672735 }
673- ch = * (idx + 1 );
736+ offset = 1 ;
737+
738+ /* skip all nulls */
739+ while (idx + offset < end && * (idx + offset ) == 0 ) {
740+ offset += 1 ;
741+ }
742+ if (idx + offset == end ) {
743+ hs -> state = h5_state_eof ;
744+ hs -> token_start = hs -> s + hs -> pos ;
745+ hs -> token_len = hs -> len - hs -> pos ;
746+ hs -> token_type = TAG_COMMENT ;
747+ return 1 ;
748+ }
749+
750+ ch = * (idx + offset );
674751 if (ch != CHAR_DASH && ch != CHAR_BANG ) {
675752 pos = (size_t )(idx - hs -> s ) + 1 ;
676753 continue ;
677754 }
678- ch = * (idx + 2 );
755+
756+ /* need to test */
757+ #if 0
758+ /* skip all nulls */
759+ while (idx + offset < end && * (idx + offset ) == 0 ) {
760+ offset += 1 ;
761+ }
762+ if (idx + offset == end ) {
763+ hs -> state = h5_state_eof ;
764+ hs -> token_start = hs -> s + hs -> pos ;
765+ hs -> token_len = hs -> len - hs -> pos ;
766+ hs -> token_type = TAG_COMMENT ;
767+ return 1 ;
768+ }
769+ #endif
770+
771+ offset += 1 ;
772+ if (idx + offset == end ) {
773+ hs -> state = h5_state_eof ;
774+ hs -> token_start = hs -> s + hs -> pos ;
775+ hs -> token_len = hs -> len - hs -> pos ;
776+ hs -> token_type = TAG_COMMENT ;
777+ return 1 ;
778+ }
779+
780+
781+ ch = * (idx + offset );
679782 if (ch != CHAR_GT ) {
680783 pos = (size_t )(idx - hs -> s ) + 1 ;
681784 continue ;
682785 }
786+ offset += 1 ;
683787
684788 /* ends in --> or -!> */
685789 hs -> token_start = hs -> s + hs -> pos ;
686790 hs -> token_len = (size_t )(idx - hs -> s ) - hs -> pos ;
687- hs -> pos = (size_t )(idx - hs -> s ) + 3 ;
791+ hs -> pos = (size_t )(idx + offset - hs -> s );
688792 hs -> state = h5_state_data ;
689793 hs -> token_type = TAG_COMMENT ;
690794 return 1 ;
0 commit comments