/*
 * Hurl (https://hurl.dev)
 * Copyright (C) 2024 Orange
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *          http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */
18
use std::os::raw::{c_char, c_int, c_void};
19
use std::ptr;
20

            
21
use libxml::bindings::{
22
    xmlChar, xmlCreatePushParserCtxt, xmlFreeParserCtxt, xmlParseChunk, xmlSAXHandlerPtr,
23
};
24

            
25
use crate::parser::{ParseError, ParseErrorKind, ParseResult};
26
use crate::reader::Reader;
27

            
28
/// Parses a text buffer until a valid XML has been found.
29
/// We're using a SAX XML parser because we need to stop the parsing at the byte position where
30
/// an XML text is detected.
31
/// For example, when we have this kind of Hurl file:
32
///
33
/// ```hurl
34
/// POST https://foo.com
35
/// <?xml version="1.0"?>
36
/// <catalog>
37
///   <book id="bk101">
38
///     <author>Gambardella, Matthew</author>
39
///     <title>XML Developer's Guide</title>
40
///   </book>
41
/// </catalog>
42
/// HTTP 201
43
/// ```
44
///
45
/// As there is no "formal" end of body, we need to parse the string until we detect at the precise
46
/// byte a possible valid XML body.
47
///
48
24530
pub fn parse(reader: &mut Reader) -> ParseResult<String> {
49
24530
    // We test if our first character is a start of an XML text.
50
24530
    // If not, we return immediately a recoverable error.
51
24530
    // Otherwise, we start parsing the supposed XML buffer. Any subsequent error will be a
52
24530
    // non-recoverable error.
53
24530
    let c = reader.peek();
54
24530
    match c {
55
110
        Some('<') => {}
56
        _ => {
57
24420
            return Err(ParseError::new(
58
24420
                reader.cursor().pos,
59
24420
                true,
60
24420
                ParseErrorKind::Xml,
61
24420
            ))
62
        }
63
    }
64

            
65
110
    let mut buf = String::new();
66
110
    let mut parser = new_sax_parser();
67
110
    let mut parser_context = ParserContext::new();
68
110

            
69
110
    // We use libxml SAX parser to identify the end of the XML body.
70
110
    // We feed the SAX parser chars by chars (Rust char), so chunks are UFT-8 bytes,
71
110
    // 1 byte to 4 bytes long. The detection of the body end is done when receiving a closing
72
110
    // element event by testing the depth of the XML tree.
73
110
    unsafe {
74
110
        let context = xmlCreatePushParserCtxt(
75
110
            &mut parser as xmlSAXHandlerPtr,
76
110
            &mut parser_context as *mut ParserContext as *mut c_void,
77
110
            ptr::null(),
78
110
            0,
79
110
            ptr::null(),
80
110
        );
81
110

            
82
110
        // We keep track of the previous char reader position, to accurately raise eventual error.
83
110
        let mut prev_pos = reader.cursor().pos;
84

            
85
446615
        while let Some(c) = reader.read() {
86
446615
            buf.push(c);
87
446615

            
88
446615
            // We feed the parser chars by chars.
89
446615
            // A buffer of length four is large enough to encode any char.
90
446615
            let mut bytes = [0u8; 4];
91
446615
            let end = reader.is_eof() as c_int;
92
446615
            let bytes = c.encode_utf8(&mut bytes);
93
446615
            let count = bytes.len() as c_int;
94
446615
            let bytes = bytes.as_ptr() as *const c_char;
95
446615
            let ret = xmlParseChunk(context, bytes, count, end);
96
446615
            if ret != 0 {
97
5
                xmlFreeParserCtxt(context);
98
5
                return Err(ParseError::new(prev_pos, false, ParseErrorKind::Xml));
99
            }
100

            
101
            // End of the XML body is detected with a closing element event and depth of the tree.
102
            // There is also a closing document event but it's not always raised at the exact
103
            // closing `>` position.
104
446610
            if std::matches!(parser_context.state, ParserState::EndElement)
105
159025
                && parser_context.depth == 0
106
            {
107
105
                break;
108
            }
109
446505
            prev_pos = reader.cursor().pos;
110
        }
111

            
112
105
        xmlFreeParserCtxt(context);
113
    }
114
105

            
115
105
    Ok(buf)
116
}
117

            
118
/// A context for the SAX parser, containing a `state` and the current tree `depth`.
119
struct ParserContext {
120
    depth: usize,
121
    state: ParserState,
122
}
123

            
124
impl ParserContext {
125
110
    fn new() -> ParserContext {
126
110
        ParserContext {
127
110
            depth: 0,
128
110
            state: ParserState::Created,
129
        }
130
    }
131
}
132

            
133
/// Last SAX event seen by the parser, as recorded in a `ParserContext`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum ParserState {
    /// No SAX event has been received yet.
    Created,
    /// The start-document callback was the last one called.
    StartDocument,
    /// The end-document callback was the last one called.
    EndDocument,
    /// The start-element callback was the last one called.
    StartElement,
    /// The end-element callback was the last one called.
    EndElement,
}
140

            
141
110
fn new_sax_parser() -> libxml::bindings::xmlSAXHandler {
142
110
    libxml::bindings::xmlSAXHandler {
143
110
        internalSubset: None,
144
110
        isStandalone: None,
145
110
        hasInternalSubset: None,
146
110
        hasExternalSubset: None,
147
110
        resolveEntity: None,
148
110
        getEntity: None,
149
110
        entityDecl: None,
150
110
        notationDecl: None,
151
110
        attributeDecl: None,
152
110
        elementDecl: None,
153
110
        unparsedEntityDecl: None,
154
110
        setDocumentLocator: None,
155
110
        startDocument: Some(on_start_document),
156
110
        endDocument: Some(on_end_document),
157
110
        startElement: None,
158
110
        endElement: None,
159
110
        reference: None,
160
110
        characters: None,
161
110
        ignorableWhitespace: None,
162
110
        processingInstruction: None,
163
110
        comment: None,
164
110
        warning: None,
165
110
        error: None,
166
110
        fatalError: None,
167
110
        getParameterEntity: None,
168
110
        cdataBlock: None,
169
110
        externalSubset: None,
170
110
        initialized: libxml::bindings::XML_SAX2_MAGIC,
171
110
        _private: ptr::null_mut(),
172
110
        startElementNs: Some(on_start_element),
173
110
        endElementNs: Some(on_end_element),
174
110
        serror: None,
175
    }
176
}
177

            
178
/// Called when the document start being processed.
179
110
unsafe extern "C" fn on_start_document(ctx: *mut c_void) {
180
110
    let context: &mut ParserContext = unsafe { &mut *(ctx as *mut ParserContext) };
181
110
    context.state = ParserState::StartDocument;
182
}
183

            
184
/// Called when the document end has been detected.
185
5
unsafe extern "C" fn on_end_document(ctx: *mut c_void) {
186
5
    let context: &mut ParserContext = unsafe { &mut *(ctx as *mut ParserContext) };
187
5
    context.state = ParserState::EndDocument;
188
}
189

            
190
/// Called when an opening tag has been processed.
191
6975
unsafe extern "C" fn on_start_element(
192
6975
    ctx: *mut c_void,
193
6975
    _local_name: *const xmlChar,
194
6975
    _prefix: *const xmlChar,
195
6975
    _uri: *const xmlChar,
196
6975
    _nb_namespaces: c_int,
197
6975
    _namespaces: *mut *const xmlChar,
198
6975
    _nb_attributes: c_int,
199
6975
    _nb_defaulted: c_int,
200
6975
    _attributes: *mut *const xmlChar,
201
6975
) {
202
6975
    let context: &mut ParserContext = unsafe { &mut *(ctx as *mut ParserContext) };
203
6975
    context.state = ParserState::StartElement;
204
6975
    context.depth += 1;
205
}
206

            
207
/// Called when the end of an element has been detected.
208
6965
unsafe extern "C" fn on_end_element(
209
6965
    ctx: *mut c_void,
210
6965
    _local_name: *const xmlChar,
211
6965
    _prefix: *const xmlChar,
212
6965
    _uri: *const xmlChar,
213
6965
) {
214
6965
    let context: &mut ParserContext = unsafe { &mut *(ctx as *mut ParserContext) };
215
6965
    context.state = ParserState::EndElement;
216
6965
    context.depth -= 1;
217
}
218

            
219
#[cfg(test)]
mod tests {
    use super::*;
    use crate::reader::Pos;

    /// Inputs that are not XML, or that are malformed XML.
    #[test]
    fn parse_xml_brute_force_errors() {
        // Nothing to read: the error is recoverable.
        let mut reader = Reader::new("");
        let error = parse(&mut reader).err().unwrap();
        assert_eq!(error.pos, Pos { line: 1, column: 1 });
        assert_eq!(error.kind, ParseErrorKind::Xml);
        assert!(error.recoverable);

        // The first char is not `<`: the error is recoverable.
        let mut reader = Reader::new("x");
        let error = parse(&mut reader).err().unwrap();
        assert_eq!(error.pos, Pos { line: 1, column: 1 });
        assert_eq!(error.kind, ParseErrorKind::Xml);
        assert!(error.recoverable);

        // From now on the input starts with `<`: every error is non-recoverable.
        let mut reader = Reader::new("<<");
        let error = parse(&mut reader).err().unwrap();
        assert_eq!(error.pos, Pos { line: 1, column: 2 });
        assert_eq!(error.kind, ParseErrorKind::Xml);
        assert!(!error.recoverable);

        // The closing `>` of the root element is missing.
        let mut reader = Reader::new("<users><user /></users");
        let error = parse(&mut reader).err().unwrap();
        assert_eq!(
            error.pos,
            Pos {
                line: 1,
                column: 22
            }
        );
        assert_eq!(error.kind, ParseErrorKind::Xml);
        assert!(!error.recoverable);

        // The attribute `aa` has no value.
        let mut reader = Reader::new("<users aa><user /></users");
        let error = parse(&mut reader).err().unwrap();
        assert_eq!(
            error.pos,
            Pos {
                line: 1,
                column: 10
            }
        );
        assert_eq!(error.kind, ParseErrorKind::Xml);
        assert!(!error.recoverable);
    }

    /// Well-formed XML, alone or followed by other data.
    #[test]
    fn parse_xml_brute_force_ok() {
        let mut reader = Reader::new("<users><user /></users>");
        assert_eq!(
            parse(&mut reader).unwrap(),
            String::from("<users><user /></users>")
        );
        assert_eq!(reader.cursor().index, 23);

        // Chars after the end of the root element are left in the reader.
        let mut reader = Reader::new("<users><user /></users>xx");
        assert_eq!(
            parse(&mut reader).unwrap(),
            String::from("<users><user /></users>")
        );
        assert_eq!(reader.cursor().index, 23);
        assert_eq!(reader.peek_n(2), String::from("xx"));

        // With an XML declaration.
        let mut reader = Reader::new("<?xml version=\"1.0\"?><users/>xxx");
        assert_eq!(
            parse(&mut reader).unwrap(),
            String::from("<?xml version=\"1.0\"?><users/>")
        );
        assert_eq!(reader.cursor().index, 29);
    }

    /// An XML body with multi-byte UTF-8 chars.
    #[test]
    fn parse_xml_soap_utf8() {
        let xml = r#"<?xml version='1.0' encoding='UTF-8'?>
<soapenv:Envelope xmlns:soapenv="http://schemas.xmlsoap.org/soap/envelope/">
    <soapenv:Body>
        <ns31:UploadInboundResponseElement xmlns:ns31="http://www.example.com/schema/xyzWS">
            <ns31:UploadInboundResult>&lt;?xml version="1.0" encoding="UTF-8" ?>&lt;ATKCST>&lt;Head>&lt;FunCode>9000&lt;/FunCode>&lt;Remark>接收数据成功&lt;/Remark>&lt;/Head>&lt;/ATKCST></ns31:UploadInboundResult>
        </ns31:UploadInboundResponseElement>
    </soapenv:Body>
</soapenv:Envelope>"#;

        // A valid complete XML
        let input = xml;
        let output = xml;
        let mut reader = Reader::new(input);
        assert_eq!(parse(&mut reader).unwrap(), String::from(output));
        assert_eq!(reader.cursor().index, 520);

        // A XML with data padding
        let input = format!("{xml} xx xx xx xx");
        let output = xml;
        let mut reader = Reader::new(&input);
        assert_eq!(parse(&mut reader).unwrap(), String::from(output));
        assert_eq!(reader.cursor().index, 520);

        // Two consecutive XML
        let input = format!("{xml}{xml}");
        let output = xml;
        let mut reader = Reader::new(&input);
        assert_eq!(parse(&mut reader).unwrap(), String::from(output));
        assert_eq!(reader.cursor().index, 520);

        // Same input again, with a fresh reader: the result is the same.
        let mut reader = Reader::new(&input);
        assert_eq!(parse(&mut reader).unwrap(), String::from(output));
        assert_eq!(reader.cursor().index, 520);
    }

    /// A large XML body followed by the start of a response section.
    #[test]
    fn parse_xml_books_with_entry_response_start() {
        let xml = r#"<?xml version="1.0"?>
<catalog>
   <book id="bk101">
      <author>Gambardella, Matthew</author>
      <title>XML Developer's Guide</title>
      <genre>Computer</genre>
      <price>44.95</price>
      <publish_date>2000-10-01</publish_date>
      <description>An in-depth look at creating applications
      with XML.</description>
   </book>
   <book id="bk102">
      <author>Ralls, Kim</author>
      <title>Midnight Rain</title>
      <genre>Fantasy</genre>
      <price>5.95</price>
      <publish_date>2000-12-16</publish_date>
      <description>A former architect battles corporate zombies,
      an evil sorceress, and her own childhood to become queen
      of the world.</description>
   </book>
   <book id="bk103">
      <author>Corets, Eva</author>
      <title>Maeve Ascendant</title>
      <genre>Fantasy</genre>
      <price>5.95</price>
      <publish_date>2000-11-17</publish_date>
      <description>After the collapse of a nanotechnology
      society in England, the young survivors lay the
      foundation for a new society.</description>
   </book>
   <book id="bk104">
      <author>Corets, Eva</author>
      <title>Oberon's Legacy</title>
      <genre>Fantasy</genre>
      <price>5.95</price>
      <publish_date>2001-03-10</publish_date>
      <description>In post-apocalypse England, the mysterious
      agent known only as Oberon helps to create a new life
      for the inhabitants of London. Sequel to Maeve
      Ascendant.</description>
   </book>
   <book id="bk105">
      <author>Corets, Eva</author>
      <title>The Sundered Grail</title>
      <genre>Fantasy</genre>
      <price>5.95</price>
      <publish_date>2001-09-10</publish_date>
      <description>The two daughters of Maeve, half-sisters,
      battle one another for control of England. Sequel to
      Oberon's Legacy.</description>
   </book>
   <book id="bk106">
      <author>Randall, Cynthia</author>
      <title>Lover Birds</title>
      <genre>Romance</genre>
      <price>4.95</price>
      <publish_date>2000-09-02</publish_date>
      <description>When Carla meets Paul at an ornithology
      conference, tempers fly as feathers get ruffled.</description>
   </book>
   <book id="bk107">
      <author>Thurman, Paula</author>
      <title>Splish Splash</title>
      <genre>Romance</genre>
      <price>4.95</price>
      <publish_date>2000-11-02</publish_date>
      <description>A deep sea diver finds true love twenty
      thousand leagues beneath the sea.</description>
   </book>
   <book id="bk108">
      <author>Knorr, Stefan</author>
      <title>Creepy Crawlies</title>
      <genre>Horror</genre>
      <price>4.95</price>
      <publish_date>2000-12-06</publish_date>
      <description>An anthology of horror stories about roaches,
      centipedes, scorpions  and other insects.</description>
   </book>
   <book id="bk109">
      <author>Kress, Peter</author>
      <title>Paradox Lost</title>
      <genre>Science Fiction</genre>
      <price>6.95</price>
      <publish_date>2000-11-02</publish_date>
      <description>After an inadvertent trip through a Heisenberg
      Uncertainty Device, James Salway discovers the problems
      of being quantum.</description>
   </book>
   <book id="bk110">
      <author>O'Brien, Tim</author>
      <title>Microsoft .NET: The Programming Bible</title>
      <genre>Computer</genre>
      <price>36.95</price>
      <publish_date>2000-12-09</publish_date>
      <description>Microsoft's .NET initiative is explored in
      detail in this deep programmer's reference.</description>
   </book>
   <book id="bk111">
      <author>O'Brien, Tim</author>
      <title>MSXML3: A Comprehensive Guide</title>
      <genre>Computer</genre>
      <price>36.95</price>
      <publish_date>2000-12-01</publish_date>
      <description>The Microsoft MSXML3 parser is covered in
      detail, with attention to XML DOM interfaces, XSLT processing,
      SAX and more.</description>
   </book>
   <book id="bk112">
      <author>Galos, Mike</author>
      <title>Visual Studio 7: A Comprehensive Guide</title>
      <genre>Computer</genre>
      <price>49.95</price>
      <publish_date>2001-04-16</publish_date>
      <description>Microsoft Visual Studio 7 is explored in depth,
      looking at how Visual Basic, Visual C++, C#, and ASP+ are
      integrated into a comprehensive development
      environment.</description>
   </book>
</catalog>"#;

        // The parser must stop right after `</catalog>`, before the response section.
        let chunk = format!("{xml}\nHTTP 200");
        let mut reader = Reader::new(&chunk);
        assert_eq!(parse(&mut reader).unwrap(), String::from(xml));
        assert_eq!(reader.cursor().index, 4411);
    }
}