C# - How do I strip valid XML from a file that contains excess data and junk?
I have tried parsing with regex, but the XML in the files is not being parsed!
The tag names are not predefined, which means the solution should identify the start and end of each XML tag by itself.
The sample data goes like this:
<xml data> random text <xml data> random text
As said in the comments, this is a hard problem without knowledge of the data. There are a lot of edge cases.
The code below may work the way you want. It attempts to find elements ('<element ...' or '<element/>'), then determines the XML content by finding the end tag. It checks the candidate by trying to parse it as XML, and if parsing fails it rejects the candidate.
This code is written for clarity, not performance or structure (it's a mainline fall-through). It should give you an example that you can use as a starting point for the type of parsing you described.
using System;
using System.Linq;
using System.Text;
using System.Xml;
using System.Xml.Linq;

namespace MessyXml
{
    /// <summary>
    /// Demonstrates pulling well-formed XML fragments out of text that also
    /// contains arbitrary junk. Tag names are not known in advance: every '&lt;'
    /// is treated as a possible element start and the candidate fragment is
    /// accepted only if <see cref="XDocument.Parse(string)"/> accepts it.
    /// </summary>
    class Program
    {
        // Sample input: two real fragments (<fruits>...</fruits> and
        // <parts no='123'>...</parts>) surrounded by text that merely looks like XML.
        const string AlmostXml = @" @#$%random junk <fruits> <apples> pies </apples> <pears> tarts </pears> </fruits> junk fruits apples , pears can made pies , tarts. think pears<apples. edge case might <parts or /apples> <parts no='123'> pie plate </parts>";

        // Characters that end an element name inside a start tag. '/' is included
        // so that "<element/>" yields the name "element" rather than "element/".
        private static readonly char[] NameTerminators = { ' ', '\t', '\r', '\n', '/', '>' };

        /// <summary>
        /// Prints the sample text, extracts every well-formed XML fragment from it
        /// and prints the fragments, one per line.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        static void Main(string[] args)
        {
            Console.WriteLine("extracting xml from:");
            Console.WriteLine(AlmostXml);
            Console.WriteLine();

            string validXml = ExtractValidXml(AlmostXml);

            Console.WriteLine("-----------");
            Console.WriteLine("valid xml found:");
            Console.WriteLine(validXml);

            // ReadKey throws InvalidOperationException when stdin is redirected
            // (piped input, CI), so only pause when a real console is attached.
            if (!Console.IsInputRedirected)
            {
                Console.ReadKey();
            }
        }

        /// <summary>
        /// Scans <paramref name="text"/> left to right and collects every fragment
        /// that parses as a complete XML element.
        /// </summary>
        /// <param name="text">Text that may contain XML mixed with other content.</param>
        /// <returns>The accepted fragments, each followed by a line break.</returns>
        private static string ExtractValidXml(string text)
        {
            var found = new StringBuilder();
            if (string.IsNullOrEmpty(text))
            {
                return found.ToString();
            }

            int i = 0;
            while (i < text.Length)
            {
                string xml;
                if (text[i] == '<' && TryExtractElement(text, i, out xml))
                {
                    found.AppendLine(xml);
                    i += xml.Length; // resume scanning after the accepted fragment
                    continue;
                }

                i++;
            }

            return found.ToString();
        }

        /// <summary>
        /// Tries to read one complete element whose '&lt;' is at <paramref name="start"/>.
        /// </summary>
        /// <param name="text">The text being scanned.</param>
        /// <param name="start">Index of the '&lt;' that may open an element.</param>
        /// <param name="xml">The well-formed fragment on success; otherwise null.</param>
        /// <returns>True when a well-formed element starting at <paramref name="start"/> was found.</returns>
        private static bool TryExtractElement(string text, int start, out string xml)
        {
            xml = null;

            int nameEnd = text.IndexOfAny(NameTerminators, start + 1);
            if (nameEnd < 0)
            {
                return false;
            }

            string name = text.Substring(start + 1, nameEnd - start - 1);
            if (name.Length == 0)
            {
                // "</...", "<>", "< ..." : not an element start.
                return false;
            }

            int openEnd = text.IndexOf('>', nameEnd);
            if (openEnd < 0)
            {
                return false;
            }

            // Self-closing form: <element/> or <element attr='x'/>.
            if (text[openEnd - 1] == '/')
            {
                string selfClosed = text.Substring(start, openEnd - start + 1);
                if (IsWellFormed(selfClosed))
                {
                    xml = selfClosed;
                    return true;
                }
                // Otherwise the "/>" may sit inside an attribute value; fall
                // through and look for an explicit end tag instead.
            }

            // Try each occurrence of the end tag in turn, so that nested elements
            // with the same name (<a><a/></a>) are matched at the outermost close.
            // Note: end tags written with inner whitespace ("</a >") are not recognised.
            string endTag = "</" + name + ">";
            int searchFrom = openEnd;
            while (true)
            {
                int endIx = text.IndexOf(endTag, searchFrom, StringComparison.Ordinal);
                if (endIx < 0)
                {
                    return false;
                }

                int fragmentEnd = endIx + endTag.Length;
                string candidate = text.Substring(start, fragmentEnd - start);
                if (IsWellFormed(candidate))
                {
                    xml = candidate;
                    return true;
                }

                searchFrom = fragmentEnd;
            }
        }

        /// <summary>
        /// Returns true when <paramref name="candidate"/> parses as an XML document.
        /// </summary>
        private static bool IsWellFormed(string candidate)
        {
            try
            {
                XDocument.Parse(candidate);
                return true;
            }
            catch (XmlException)
            {
                // Expected for junk that only looks like markup: reject the candidate.
                return false;
            }
        }
    }
}
Comments
Post a Comment