1
/*
2
 * Hurl (https://hurl.dev)
3
 * Copyright (C) 2024 Orange
4
 *
5
 * Licensed under the Apache License, Version 2.0 (the "License");
6
 * you may not use this file except in compliance with the License.
7
 * You may obtain a copy of the License at
8
 *
9
 *          http://www.apache.org/licenses/LICENSE-2.0
10
 *
11
 * Unless required by applicable law or agreed to in writing, software
12
 * distributed under the License is distributed on an "AS IS" BASIS,
13
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
 * See the License for the specific language governing permissions and
15
 * limitations under the License.
16
 *
17
 */
18
use std::os::raw::{c_char, c_int, c_void};
19
use std::ptr;
20

            
21
use libxml::bindings::{
22
    xmlChar, xmlCreatePushParserCtxt, xmlFreeParserCtxt, xmlParseChunk, xmlSAXHandlerPtr,
23
};
24

            
25
use crate::parser::error::*;
26
use crate::parser::ParseResult;
27
use crate::reader::Reader;
28

            
29
/// Parses a text buffer until a valid XML has been found.
30
/// We're using a SAX XML parser because we need to stop the parsing at the byte position where
31
/// an XML text is detected.
32
/// For example, when we have this kind of Hurl file:
33
///
34
/// ```hurl
35
/// POST https://foo.com
36
/// <?xml version="1.0"?>
37
/// <catalog>
38
///   <book id="bk101">
39
///     <author>Gambardella, Matthew</author>
40
///     <title>XML Developer's Guide</title>
41
///   </book>
42
/// </catalog>
43
/// HTTP 201
44
/// ```
45
///
46
/// As there is no "formal" end of body, we need to parse the string until we detect at the precise
47
/// byte a possible valid XML body.
48
///
49
24475
pub fn parse(reader: &mut Reader) -> ParseResult<String> {
50
24475
    // We test if our first character is a start of an XML text.
51
24475
    // If not, we return immediately a recoverable error.
52
24475
    // Otherwise, we start parsing the supposed XML buffer. Any subsequent error will be a
53
24475
    // non-recoverable error.
54
24475
    let c = reader.peek();
55
24475
    match c {
56
110
        Some('<') => {}
57
        _ => {
58
24365
            return Err(ParseError::new(
59
24365
                reader.cursor().pos,
60
24365
                true,
61
24365
                ParseErrorKind::Xml,
62
24365
            ))
63
        }
64
    }
65

            
66
110
    let mut buf = String::new();
67
110
    let mut parser = new_sax_parser();
68
110
    let mut parser_context = ParserContext::new();
69
110

            
70
110
    // We use libxml SAX parser to identify the end of the XML body.
71
110
    // We feed the SAX parser chars by chars (Rust char), so chunks are UFT-8 bytes,
72
110
    // 1 byte to 4 bytes long. The detection of the body end is done when receiving a closing
73
110
    // element event by testing the depth of the XML tree.
74
110
    unsafe {
75
110
        let context = xmlCreatePushParserCtxt(
76
110
            &mut parser as xmlSAXHandlerPtr,
77
110
            &mut parser_context as *mut ParserContext as *mut c_void,
78
110
            ptr::null(),
79
110
            0,
80
110
            ptr::null(),
81
110
        );
82
110

            
83
110
        // We keep track of the previous char reader position, to accurately raise eventual error.
84
110
        let mut prev_pos = reader.cursor().pos;
85

            
86
446615
        while let Some(c) = reader.read() {
87
446615
            buf.push(c);
88
446615

            
89
446615
            // We feed the parser chars by chars.
90
446615
            // A buffer of length four is large enough to encode any char.
91
446615
            let mut bytes = [0u8; 4];
92
446615
            let end = reader.is_eof() as c_int;
93
446615
            let bytes = c.encode_utf8(&mut bytes);
94
446615
            let count = bytes.len() as c_int;
95
446615
            let bytes = bytes.as_ptr() as *const c_char;
96
446615
            let ret = xmlParseChunk(context, bytes, count, end);
97
446615
            if ret != 0 {
98
5
                xmlFreeParserCtxt(context);
99
5
                return Err(ParseError::new(prev_pos, false, ParseErrorKind::Xml));
100
            }
101

            
102
            // End of the XML body is detected with a closing element event and depth of the tree.
103
            // There is also a closing document event but it's not always raised at the exact
104
            // closing `>` position.
105
446610
            if std::matches!(parser_context.state, ParserState::EndElement)
106
159025
                && parser_context.depth == 0
107
            {
108
105
                break;
109
            }
110
446505
            prev_pos = reader.cursor().pos;
111
        }
112

            
113
105
        xmlFreeParserCtxt(context);
114
    }
115
105

            
116
105
    Ok(buf)
117
}
118

            
119
/// State shared between [`parse`] and the libxml SAX callbacks: the kind of the last event
/// received (`state`) and the current nesting level of the XML tree (`depth`).
struct ParserContext {
    /// Number of elements currently open: incremented by the start-element callback and
    /// decremented by the end-element callback.
    depth: usize,
    /// Last SAX event recorded by one of the callbacks.
    state: ParserState,
}

impl ParserContext {
    /// Creates a context with no open element, in the [`ParserState::Created`] state.
    fn new() -> Self {
        Self {
            state: ParserState::Created,
            depth: 0,
        }
    }
}

/// The SAX events tracked while scanning an XML body.
enum ParserState {
    /// Initial state, before any callback has run.
    Created,
    /// The start-document callback was the last one to run.
    StartDocument,
    /// The end-document callback was the last one to run.
    EndDocument,
    /// The start-element callback was the last one to run.
    StartElement,
    /// The end-element callback was the last one to run.
    EndElement,
}
141

            
142
110
fn new_sax_parser() -> libxml::bindings::xmlSAXHandler {
143
110
    libxml::bindings::xmlSAXHandler {
144
110
        internalSubset: None,
145
110
        isStandalone: None,
146
110
        hasInternalSubset: None,
147
110
        hasExternalSubset: None,
148
110
        resolveEntity: None,
149
110
        getEntity: None,
150
110
        entityDecl: None,
151
110
        notationDecl: None,
152
110
        attributeDecl: None,
153
110
        elementDecl: None,
154
110
        unparsedEntityDecl: None,
155
110
        setDocumentLocator: None,
156
110
        startDocument: Some(on_start_document),
157
110
        endDocument: Some(on_end_document),
158
110
        startElement: None,
159
110
        endElement: None,
160
110
        reference: None,
161
110
        characters: None,
162
110
        ignorableWhitespace: None,
163
110
        processingInstruction: None,
164
110
        comment: None,
165
110
        warning: None,
166
110
        error: None,
167
110
        fatalError: None,
168
110
        getParameterEntity: None,
169
110
        cdataBlock: None,
170
110
        externalSubset: None,
171
110
        initialized: libxml::bindings::XML_SAX2_MAGIC,
172
110
        _private: ptr::null_mut(),
173
110
        startElementNs: Some(on_start_element),
174
110
        endElementNs: Some(on_end_element),
175
110
        serror: None,
176
    }
177
}
178

            
179
/// Called when the document start being processed.
180
110
unsafe extern "C" fn on_start_document(ctx: *mut c_void) {
181
110
    let context: &mut ParserContext = unsafe { &mut *(ctx as *mut ParserContext) };
182
110
    context.state = ParserState::StartDocument;
183
}
184

            
185
/// Called when the document end has been detected.
186
5
unsafe extern "C" fn on_end_document(ctx: *mut c_void) {
187
5
    let context: &mut ParserContext = unsafe { &mut *(ctx as *mut ParserContext) };
188
5
    context.state = ParserState::EndDocument;
189
}
190

            
191
/// Called when an opening tag has been processed.
192
6975
unsafe extern "C" fn on_start_element(
193
6975
    ctx: *mut c_void,
194
6975
    _local_name: *const xmlChar,
195
6975
    _prefix: *const xmlChar,
196
6975
    _uri: *const xmlChar,
197
6975
    _nb_namespaces: c_int,
198
6975
    _namespaces: *mut *const xmlChar,
199
6975
    _nb_attributes: c_int,
200
6975
    _nb_defaulted: c_int,
201
6975
    _attributes: *mut *const xmlChar,
202
6975
) {
203
6975
    let context: &mut ParserContext = unsafe { &mut *(ctx as *mut ParserContext) };
204
6975
    context.state = ParserState::StartElement;
205
6975
    context.depth += 1;
206
}
207

            
208
/// Called when the end of an element has been detected.
209
6965
unsafe extern "C" fn on_end_element(
210
6965
    ctx: *mut c_void,
211
6965
    _local_name: *const xmlChar,
212
6965
    _prefix: *const xmlChar,
213
6965
    _uri: *const xmlChar,
214
6965
) {
215
6965
    let context: &mut ParserContext = unsafe { &mut *(ctx as *mut ParserContext) };
216
6965
    context.state = ParserState::EndElement;
217
6965
    context.depth -= 1;
218
}
219

            
220
#[cfg(test)]
221
mod tests {
222
    use super::*;
223
    use crate::reader::Pos;
224

            
225
    #[test]
    fn parse_xml_brute_force_errors() {
        // Each case is (input, error line, error column, recoverable).
        // An input that does not start with `<` is a recoverable error; once a `<` has been
        // consumed, every error is non-recoverable.
        let cases = [
            ("", 1, 1, true),
            ("x", 1, 1, true),
            ("<<", 1, 2, false),
            ("<users><user /></users", 1, 22, false),
            ("<users aa><user /></users", 1, 10, false),
        ];

        for (input, line, column, recoverable) in cases {
            let mut reader = Reader::new(input);
            let error = parse(&mut reader).unwrap_err();
            assert_eq!(error.pos, Pos { line, column }, "input: {input:?}");
            assert_eq!(error.kind, ParseErrorKind::Xml, "input: {input:?}");
            assert_eq!(error.recoverable, recoverable, "input: {input:?}");
        }
    }
267

            
268
    #[test]
    fn parse_xml_brute_force_ok() {
        // The whole input is an XML document.
        let mut reader = Reader::new("<users><user /></users>");
        let body = parse(&mut reader).unwrap();
        assert_eq!(body, "<users><user /></users>");
        assert_eq!(reader.cursor().index, 23);

        // Trailing text is left untouched in the reader.
        let mut reader = Reader::new("<users><user /></users>xx");
        let body = parse(&mut reader).unwrap();
        assert_eq!(body, "<users><user /></users>");
        assert_eq!(reader.cursor().index, 23);
        assert_eq!(reader.peek_n(2), String::from("xx"));

        // An XML declaration followed by an empty root element.
        let mut reader = Reader::new("<?xml version=\"1.0\"?><users/>xxx");
        let body = parse(&mut reader).unwrap();
        assert_eq!(body, "<?xml version=\"1.0\"?><users/>");
        assert_eq!(reader.cursor().index, 29);
    }
292

            
293
    #[test]
    fn parse_xml_soap_utf8() {
        let xml = r#"<?xml version='1.0' encoding='UTF-8'?>
<soapenv:Envelope xmlns:soapenv="http://schemas.xmlsoap.org/soap/envelope/">
    <soapenv:Body>
        <ns31:UploadInboundResponseElement xmlns:ns31="http://www.example.com/schema/xyzWS">
            <ns31:UploadInboundResult>&lt;?xml version="1.0" encoding="UTF-8" ?>&lt;ATKCST>&lt;Head>&lt;FunCode>9000&lt;/FunCode>&lt;Remark>接收数据成功&lt;/Remark>&lt;/Head>&lt;/ATKCST></ns31:UploadInboundResult>
        </ns31:UploadInboundResponseElement>
    </soapenv:Body>
</soapenv:Envelope>"#;

        // A valid complete XML
        let mut reader = Reader::new(xml);
        assert_eq!(parse(&mut reader).unwrap(), xml);
        assert_eq!(reader.cursor().index, 520);

        // A XML with data padding
        let input = format!("{xml} xx xx xx xx");
        let mut reader = Reader::new(&input);
        assert_eq!(parse(&mut reader).unwrap(), xml);
        assert_eq!(reader.cursor().index, 520);

        // Two consecutive XML: the first call stops right after the first document...
        let input = format!("{xml}{xml}");
        let mut reader = Reader::new(&input);
        assert_eq!(parse(&mut reader).unwrap(), xml);
        assert_eq!(reader.cursor().index, 520);

        // ...and a second call on the same reader consumes the second document, which ends
        // exactly twice as far in the input.
        assert_eq!(parse(&mut reader).unwrap(), xml);
        assert_eq!(reader.cursor().index, 1040);
    }
329

            
330
    #[test]
    fn parse_xml_books_with_entry_response_start() {
        // A multi-line XML body, as it would appear in a Hurl file before the response section.
        let xml = r#"<?xml version="1.0"?>
<catalog>
   <book id="bk101">
      <author>Gambardella, Matthew</author>
      <title>XML Developer's Guide</title>
      <genre>Computer</genre>
      <price>44.95</price>
      <publish_date>2000-10-01</publish_date>
      <description>An in-depth look at creating applications
      with XML.</description>
   </book>
   <book id="bk102">
      <author>Ralls, Kim</author>
      <title>Midnight Rain</title>
      <genre>Fantasy</genre>
      <price>5.95</price>
      <publish_date>2000-12-16</publish_date>
      <description>A former architect battles corporate zombies,
      an evil sorceress, and her own childhood to become queen
      of the world.</description>
   </book>
   <book id="bk103">
      <author>Corets, Eva</author>
      <title>Maeve Ascendant</title>
      <genre>Fantasy</genre>
      <price>5.95</price>
      <publish_date>2000-11-17</publish_date>
      <description>After the collapse of a nanotechnology
      society in England, the young survivors lay the
      foundation for a new society.</description>
   </book>
   <book id="bk104">
      <author>Corets, Eva</author>
      <title>Oberon's Legacy</title>
      <genre>Fantasy</genre>
      <price>5.95</price>
      <publish_date>2001-03-10</publish_date>
      <description>In post-apocalypse England, the mysterious
      agent known only as Oberon helps to create a new life
      for the inhabitants of London. Sequel to Maeve
      Ascendant.</description>
   </book>
   <book id="bk105">
      <author>Corets, Eva</author>
      <title>The Sundered Grail</title>
      <genre>Fantasy</genre>
      <price>5.95</price>
      <publish_date>2001-09-10</publish_date>
      <description>The two daughters of Maeve, half-sisters,
      battle one another for control of England. Sequel to
      Oberon's Legacy.</description>
   </book>
   <book id="bk106">
      <author>Randall, Cynthia</author>
      <title>Lover Birds</title>
      <genre>Romance</genre>
      <price>4.95</price>
      <publish_date>2000-09-02</publish_date>
      <description>When Carla meets Paul at an ornithology
      conference, tempers fly as feathers get ruffled.</description>
   </book>
   <book id="bk107">
      <author>Thurman, Paula</author>
      <title>Splish Splash</title>
      <genre>Romance</genre>
      <price>4.95</price>
      <publish_date>2000-11-02</publish_date>
      <description>A deep sea diver finds true love twenty
      thousand leagues beneath the sea.</description>
   </book>
   <book id="bk108">
      <author>Knorr, Stefan</author>
      <title>Creepy Crawlies</title>
      <genre>Horror</genre>
      <price>4.95</price>
      <publish_date>2000-12-06</publish_date>
      <description>An anthology of horror stories about roaches,
      centipedes, scorpions  and other insects.</description>
   </book>
   <book id="bk109">
      <author>Kress, Peter</author>
      <title>Paradox Lost</title>
      <genre>Science Fiction</genre>
      <price>6.95</price>
      <publish_date>2000-11-02</publish_date>
      <description>After an inadvertent trip through a Heisenberg
      Uncertainty Device, James Salway discovers the problems
      of being quantum.</description>
   </book>
   <book id="bk110">
      <author>O'Brien, Tim</author>
      <title>Microsoft .NET: The Programming Bible</title>
      <genre>Computer</genre>
      <price>36.95</price>
      <publish_date>2000-12-09</publish_date>
      <description>Microsoft's .NET initiative is explored in
      detail in this deep programmer's reference.</description>
   </book>
   <book id="bk111">
      <author>O'Brien, Tim</author>
      <title>MSXML3: A Comprehensive Guide</title>
      <genre>Computer</genre>
      <price>36.95</price>
      <publish_date>2000-12-01</publish_date>
      <description>The Microsoft MSXML3 parser is covered in
      detail, with attention to XML DOM interfaces, XSLT processing,
      SAX and more.</description>
   </book>
   <book id="bk112">
      <author>Galos, Mike</author>
      <title>Visual Studio 7: A Comprehensive Guide</title>
      <genre>Computer</genre>
      <price>49.95</price>
      <publish_date>2001-04-16</publish_date>
      <description>Microsoft Visual Studio 7 is explored in depth,
      looking at how Visual Basic, Visual C++, C#, and ASP+ are
      integrated into a comprehensive development
      environment.</description>
   </book>
</catalog>"#;

        // The XML body is followed by the start of the response section: parsing must stop
        // right after `</catalog>` and return the XML text unchanged.
        let input = [xml, "HTTP 200"].join("\n");
        let mut reader = Reader::new(&input);
        let body = parse(&mut reader).unwrap();
        assert_eq!(body, xml);
        assert_eq!(reader.cursor().index, 4411);
    }
458
}