... | ... |
@@ -1,4 +1,4 @@ |
1 | 1 |
jpurexml |
2 | 2 |
======== |
3 | 3 |
|
4 |
-XML parser in pure Java code so tools like Google's PlayN can transcompile it to all its target platforms |
|
5 | 4 |
\ No newline at end of file |
5 |
+Simple all-in-memory XML parser in pure Java code so tools like Google's PlayN can transcompile it to all its target platforms |
|
6 | 6 |
\ No newline at end of file |
7 | 7 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,107 @@ |
1 |
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" |
|
2 |
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> |
|
3 |
+ <modelVersion>4.0.0</modelVersion> |
|
4 |
+ <groupId>com.github.asilvestre</groupId> |
|
5 |
+ <artifactId>jpurexml</artifactId> |
|
6 |
+ <version>0.9</version> |
|
7 |
+ <packaging>jar</packaging> |
|
8 |
+ <name>jpurexml</name> |
|
9 |
+ <url>https://github.com/asilvestre/jpurexml</url> |
|
10 |
+ <description>Simple all-in-memory XML parser coded in pure Java code so tools like Google's PlayN can transcompile it to all its target platforms</description> |
|
11 |
+ |
|
12 |
+ <licenses> |
|
13 |
+ <license> |
|
14 |
+ <name>The Apache Software License, Version 2.0</name> |
|
15 |
+ <url>http://www.apache.org/licenses/LICENSE-2.0.txt</url> |
|
16 |
+ <distribution>repo</distribution> |
|
17 |
+ </license> |
|
18 |
+ </licenses> |
|
19 |
+ |
|
20 |
+ <scm> |
|
21 |
+ <url>scm:git:https://github.com/asilvestre/jpurexml.git</url> |
|
22 |
+ <connection>scm:git:https://github.com/asilvestre/jpurexml.git</connection> |
|
23 |
+ </scm> |
|
24 |
+ |
|
25 |
+ <developers> |
|
26 |
+ <developer> |
|
27 |
+ <id>asilvestre</id> |
|
28 |
+ <name>Antoni Silvestre</name> |
|
29 |
+ <email>antoni.silvestre@gmail.com</email> |
|
30 |
+ </developer> |
|
31 |
+ </developers> |
|
32 |
+ |
|
33 |
+ <properties> |
|
34 |
+ <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> |
|
35 |
+ <github.global.server>github</github.global.server> |
|
36 |
+ </properties> |
|
37 |
+ |
|
38 |
+ <build> |
|
39 |
+ <plugins> |
|
40 |
+ <plugin> |
|
41 |
+ <groupId>org.apache.maven.plugins</groupId> |
|
42 |
+ <artifactId>maven-compiler-plugin</artifactId> |
|
43 |
+ <configuration> |
|
44 |
+ <source>1.5</source> |
|
45 |
+ <target>1.5</target> |
|
46 |
+ </configuration> |
|
47 |
+ </plugin> |
|
48 |
+ <plugin> |
|
49 |
+ <groupId>com.github.github</groupId> |
|
50 |
+ <artifactId>site-maven-plugin</artifactId> |
|
51 |
+ <version>0.7</version> |
|
52 |
+ <configuration> |
|
53 |
+ <message>Creating Javadocs for ${project.version}</message> |
|
54 |
+ <repositoryName>JavaXmlToJson</repositoryName> |
|
55 |
+ <repositoryOwner>antonisilvestre</repositoryOwner> |
|
56 |
+ </configuration> |
|
57 |
+ <executions> |
|
58 |
+ <execution> |
|
59 |
+ <goals> |
|
60 |
+ <goal>site</goal> |
|
61 |
+ </goals> |
|
62 |
+ <phase>site</phase> |
|
63 |
+ </execution> |
|
64 |
+ </executions> |
|
65 |
+ </plugin> |
|
66 |
+ <plugin> |
|
67 |
+ <groupId>org.apache.maven.plugins</groupId> |
|
68 |
+ <artifactId>maven-gpg-plugin</artifactId> |
|
69 |
+ <executions> |
|
70 |
+ <execution> |
|
71 |
+ <id>sign-artifacts</id> |
|
72 |
+ <phase>verify</phase> |
|
73 |
+ <goals> |
|
74 |
+ <goal>sign</goal> |
|
75 |
+ </goals> |
|
76 |
+ </execution> |
|
77 |
+ </executions> |
|
78 |
+ </plugin> |
|
79 |
+ </plugins> |
|
80 |
+ </build> |
|
81 |
+ |
|
82 |
+ <reporting> |
|
83 |
+ <plugins> |
|
84 |
+ <plugin> |
|
85 |
+ <groupId>org.apache.maven.plugins</groupId> |
|
86 |
+ <artifactId>maven-javadoc-plugin</artifactId> |
|
87 |
+ <version>2.9</version> |
|
88 |
+ <configuration> |
|
89 |
+ </configuration> |
|
90 |
+ </plugin> |
|
91 |
+ </plugins> |
|
92 |
+ </reporting> |
|
93 |
+ |
|
94 |
+ <dependencies> |
|
95 |
+ <dependency> |
|
96 |
+ <groupId>junit</groupId> |
|
97 |
+ <artifactId>junit</artifactId> |
|
98 |
+ <version>3.8.1</version> |
|
99 |
+ <scope>test</scope> |
|
100 |
+ </dependency> |
|
101 |
+ <dependency> |
|
102 |
+ <groupId>com.github.github</groupId> |
|
103 |
+ <artifactId>site-maven-plugin</artifactId> |
|
104 |
+ <version>0.7</version> |
|
105 |
+ </dependency> |
|
106 |
+ </dependencies> |
|
107 |
+</project> |
0 | 108 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,48 @@ |
1 |
+/** |
|
2 |
+ * Copyright Antoni Silvestre |
|
3 |
+ * |
|
4 |
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not |
|
5 |
+ * use this file except in compliance with the License. You may obtain a copy of |
|
6 |
+ * the License at |
|
7 |
+ * |
|
8 |
+ * http://www.apache.org/licenses/LICENSE-2.0 |
|
9 |
+ * |
|
10 |
+ * Unless required by applicable law or agreed to in writing, software |
|
11 |
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT |
|
12 |
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the |
|
13 |
+ * License for the specific language governing permissions and limitations under |
|
14 |
+ * the License. |
|
15 |
+ */ |
|
16 |
+ |
|
17 |
+package com.github.asilvestre.jpurexml; |
|
18 |
+ |
|
19 |
/**
 * Generic utilities for the XML parser and the JSON converter.
 */
public class Utils {

    /**
     * Replaces every occurrence of a literal substring with another string.
     * <p>
     * This is a plain-text replacement: {@code String.replaceAll} uses regular
     * expressions internally and might not transcompile correctly to all
     * platforms, so the search is done with {@code indexOf} only.
     * <p>
     * Matches are searched left to right and never overlap; text inserted as a
     * replacement is not scanned again.
     *
     * @param input
     *            string to search in
     * @param toReplace
     *            literal text to look for; when empty the input is returned
     *            unchanged (an empty pattern would otherwise match at every
     *            position and never terminate)
     * @param replacement
     *            text to put in place of each match
     * @return the input with every match replaced, or the input itself when
     *         there is nothing to replace
     */
    public static String ReplaceStr(String input, String toReplace, String replacement)
    {
        // An empty pattern matches everywhere; treat it as "nothing to replace"
        if (toReplace.length() == 0)
        {
            return input;
        }

        int matchPos = input.indexOf(toReplace);
        if (matchPos == -1)
        {
            return input;
        }

        // Single left-to-right pass: copy the text between matches and emit the
        // replacement for each match, instead of rebuilding the whole string
        // once per occurrence.
        StringBuilder res = new StringBuilder(input.length());
        int copyFrom = 0;
        while (matchPos != -1)
        {
            res.append(input, copyFrom, matchPos);
            res.append(replacement);

            copyFrom = matchPos + toReplace.length();
            matchPos = input.indexOf(toReplace, copyFrom);
        }
        res.append(input, copyFrom, input.length());

        return res.toString();
    }

}
0 | 49 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,40 @@ |
1 |
+/** |
|
2 |
+ * Copyright Antoni Silvestre |
|
3 |
+ * |
|
4 |
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not |
|
5 |
+ * use this file except in compliance with the License. You may obtain a copy of |
|
6 |
+ * the License at |
|
7 |
+ * |
|
8 |
+ * http://www.apache.org/licenses/LICENSE-2.0 |
|
9 |
+ * |
|
10 |
+ * Unless required by applicable law or agreed to in writing, software |
|
11 |
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT |
|
12 |
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the |
|
13 |
+ * License for the specific language governing permissions and limitations under |
|
14 |
+ * the License. |
|
15 |
+ */ |
|
16 |
+ |
|
17 |
+package com.github.asilvestre.jpurexml; |
|
18 |
+ |
|
19 |
+/** |
|
20 |
+ * Main object that describes an XML document |
|
21 |
+ */ |
|
22 |
+public class XmlDoc { |
|
23 |
+ /** |
|
24 |
+ * Has the XML prologue, that is the initial '<?xml' tag with its version |
|
25 |
+ * and encoding |
|
26 |
+ */ |
|
27 |
+ public XmlPrologue prologue = new XmlPrologue(); |
|
28 |
+ |
|
29 |
+ /** |
|
30 |
+ * Has the root tag for the XML document |
|
31 |
+ */ |
|
32 |
+ public XmlTag root = new XmlTag(); |
|
33 |
+ |
|
34 |
+ @Override |
|
35 |
+ public String toString() { |
|
36 |
+ String res = String.format("%s%s", prologue.toString(), root.toString()); |
|
37 |
+ |
|
38 |
+ return res; |
|
39 |
+ } |
|
40 |
+} |
0 | 41 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,40 @@ |
1 |
+/** |
|
2 |
+ * Copyright Antoni Silvestre |
|
3 |
+ * |
|
4 |
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not |
|
5 |
+ * use this file except in compliance with the License. You may obtain a copy of |
|
6 |
+ * the License at |
|
7 |
+ * |
|
8 |
+ * http://www.apache.org/licenses/LICENSE-2.0 |
|
9 |
+ * |
|
10 |
+ * Unless required by applicable law or agreed to in writing, software |
|
11 |
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT |
|
12 |
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the |
|
13 |
+ * License for the specific language governing permissions and limitations under |
|
14 |
+ * the License. |
|
15 |
+ */ |
|
16 |
+ |
|
17 |
+package com.github.asilvestre.jpurexml; |
|
18 |
+ |
|
19 |
/**
 * Checked exception raised when the XML text cannot be parsed. Besides the
 * message it carries the character position the parser reported the problem
 * at.
 */
public class XmlParseException extends Exception{

    private static final long serialVersionUID = 1906896722521922104L;

    /** Position given at construction time; never changes afterwards. */
    private final int pos;

    /**
     * Builds the exception with the message "{@code <msg> at <pos>}".
     * <p>
     * The message is assembled by plain concatenation rather than
     * {@code String.format}: {@code %d} is formatted according to the default
     * locale, so the position could be rendered with non-ASCII digits.
     *
     * @param msg
     *            description of the problem
     * @param pos
     *            position in the parsed text where the problem was detected
     */
    public XmlParseException(String msg, int pos)
    {
        super(msg + " at " + pos);

        this.pos = pos;
    }

    /**
     * @return the position passed to the constructor
     */
    public int GetPos()
    {
        return pos;
    }

}
0 | 41 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,875 @@ |
1 |
+/** |
|
2 |
+ * Copyright Antoni Silvestre |
|
3 |
+ * |
|
4 |
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not |
|
5 |
+ * use this file except in compliance with the License. You may obtain a copy of |
|
6 |
+ * the License at |
|
7 |
+ * |
|
8 |
+ * http://www.apache.org/licenses/LICENSE-2.0 |
|
9 |
+ * |
|
10 |
+ * Unless required by applicable law or agreed to in writing, software |
|
11 |
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT |
|
12 |
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the |
|
13 |
+ * License for the specific language governing permissions and limitations under |
|
14 |
+ * the License. |
|
15 |
+ */ |
|
16 |
+ |
|
17 |
+package com.github.asilvestre.jpurexml; |
|
18 |
+ |
|
19 |
+import java.util.HashMap; |
|
20 |
+import java.util.LinkedList; |
|
21 |
+import java.util.TreeSet; |
|
22 |
+ |
|
23 |
+/** |
|
24 |
+ * Class to hold static methods to parse an XML string. |
|
25 |
+ */ |
|
26 |
+public class XmlParser { |
|
27 |
+ |
|
28 |
+ /** |
|
29 |
+ * Parse a String with an XML into an XmlDoc object |
|
30 |
+ * |
|
31 |
+ * @param xml |
|
32 |
+ * @throws XmlParseException |
|
33 |
+ */ |
|
34 |
+ public static XmlDoc parseXml(String xml) throws XmlParseException { |
|
35 |
+ XmlDoc res = new XmlDoc(); |
|
36 |
+ |
|
37 |
+ // First of all removing all the comments from the XML |
|
38 |
+ String procXml = removeComments(xml); |
|
39 |
+ |
|
40 |
+ // Parsing XML prologue |
|
41 |
+ int prologEnd = parsePrologue(procXml, res.prologue); |
|
42 |
+ |
|
43 |
+ // Parsing the XML body |
|
44 |
+ parseTag(procXml, prologEnd, res.root); |
|
45 |
+ |
|
46 |
+ return res; |
|
47 |
+ } |
|
48 |
+ |
|
49 |
+ /** |
|
50 |
+ * Remove XML comments |
|
51 |
+ * |
|
52 |
+ * @param xml |
|
53 |
+ * @return input XML without comments |
|
54 |
+ */ |
|
55 |
+ private static String removeComments(String xml) throws XmlParseException { |
|
56 |
+ String res = xml; |
|
57 |
+ |
|
58 |
+ int commentPos = res.indexOf("<!--"); |
|
59 |
+ while (commentPos != -1) { |
|
60 |
+ int commentEnd = res.indexOf("-->"); |
|
61 |
+ |
|
62 |
+ if (commentEnd == -1) { |
|
63 |
+ throw new XmlParseException("Missing comment ending '-->'", commentPos); |
|
64 |
+ } |
|
65 |
+ |
|
66 |
+ res = String.format("%s%s", res.substring(0, commentPos), res.substring(commentEnd + 3)); |
|
67 |
+ |
|
68 |
+ commentPos = res.indexOf("<!--"); |
|
69 |
+ } |
|
70 |
+ |
|
71 |
+ return res; |
|
72 |
+ } |
|
73 |
+ |
|
74 |
+ /** |
|
75 |
+ * Parses the first line of each XML which states its version and encoding |
|
76 |
+ * |
|
77 |
+ * @param xml |
|
78 |
+ * @param prologue |
|
79 |
+ * @return position where the encoding finishes |
|
80 |
+ */ |
|
81 |
+ private static int parsePrologue(String xml, XmlPrologue prologue) { |
|
82 |
+ prologue.version = "1.0"; |
|
83 |
+ prologue.encoding = "UTF-8"; |
|
84 |
+ |
|
85 |
+ int prologueStart = xml.indexOf("<?xml"); |
|
86 |
+ int prologueEnd = xml.indexOf("?>", prologueStart); |
|
87 |
+ |
|
88 |
+ if (prologueStart != -1 && prologueEnd != -1) { |
|
89 |
+ String prologueString = xml.substring(prologueStart + 5, prologueEnd); |
|
90 |
+ |
|
91 |
+ HashMap<String, String> prologueAttrs = new HashMap<String, String>(); |
|
92 |
+ parseAttributeList(prologueString, 0, prologueAttrs); |
|
93 |
+ |
|
94 |
+ if (prologueAttrs.containsKey("version")) { |
|
95 |
+ prologue.version = prologueAttrs.get("version"); |
|
96 |
+ } |
|
97 |
+ |
|
98 |
+ if (prologueAttrs.containsKey("encoding")) { |
|
99 |
+ prologue.encoding = prologueAttrs.get("encoding"); |
|
100 |
+ } |
|
101 |
+ } |
|
102 |
+ |
|
103 |
+ return prologueEnd != -1 ? prologueEnd + "?>".length() : 0; |
|
104 |
+ } |
|
105 |
+ |
|
106 |
+ /** |
|
107 |
+ * Parse a tag and its children |
|
108 |
+ * |
|
109 |
+ * @param xml |
|
110 |
+ * @param pointer |
|
111 |
+ * position from where to start parsing |
|
112 |
+ * @param tag |
|
113 |
+ * output parameter where the tag information will be put |
|
114 |
+ * @return position where it has stopped parsing |
|
115 |
+ * @throws XmlParseException |
|
116 |
+ */ |
|
117 |
+ private static int parseTag(String xml, int pointer, XmlTag tag) throws XmlParseException { |
|
118 |
+ int res; |
|
119 |
+ |
|
120 |
+ // Parsing the name and attributes of the tag |
|
121 |
+ int headerEnd = parseTagHeader(xml, pointer, tag); |
|
122 |
+ |
|
123 |
+ // If the tag wasn't an empty tag (finishes right away with a '/>') look |
|
124 |
+ // for children content |
|
125 |
+ if (!tag.empty) { |
|
126 |
+ // First looking for tag content which is not a children XML tag |
|
127 |
+ int childrenPos = parseTagContent(xml, headerEnd, tag); |
|
128 |
+ |
|
129 |
+ while (hasChildren(xml, childrenPos, tag.name)) { |
|
130 |
+ XmlTag child = new XmlTag(); |
|
131 |
+ childrenPos = parseTag(xml, childrenPos, child); |
|
132 |
+ |
|
133 |
+ tag.children.add(child); |
|
134 |
+ |
|
135 |
+ // As far as I know there could be child tags and content mixed, |
|
136 |
+ // TODO: I am not planning on using this, so I'll just append |
|
137 |
+ // all the content in one string |
|
138 |
+ // but I'm not preserving the order between tags and chunks of |
|
139 |
+ // content |
|
140 |
+ childrenPos = parseTagContent(xml, childrenPos, tag); |
|
141 |
+ } |
|
142 |
+ |
|
143 |
+ res = parseEndTag(xml, childrenPos, tag.name); |
|
144 |
+ } else { |
|
145 |
+ res = headerEnd; |
|
146 |
+ } |
|
147 |
+ |
|
148 |
+ return res; |
|
149 |
+ } |
|
150 |
+ |
|
151 |
+ /** |
|
152 |
+ * All the different states the tag header parser can be in. |
|
153 |
+ */ |
|
154 |
+ private enum TagHeaderStates { |
|
155 |
+ Init, TagStart, Name, AttrList, EmptyTagEnd, End, Invalid |
|
156 |
+ } |
|
157 |
+ |
|
158 |
+ /** |
|
159 |
+ * Enumeration with all the possible events we can receive when parsing an |
|
160 |
+ * tag header. |
|
161 |
+ */ |
|
162 |
+ private enum TagHeaderActions { |
|
163 |
+ Space, TagInit, NameChar, Slash, TagEnd, Invalid |
|
164 |
+ } |
|
165 |
+ |
|
166 |
+ /** |
|
167 |
+ * Structure that describes the tag header parser state machine. There is an |
|
168 |
+ * array entry for each header parser state, and for each of these there's |
|
169 |
+ * an array with an entry for each tag header parser action that describes |
|
170 |
+ * to which state should go when receiving that action. For instance in the |
|
171 |
+ * tag header name state if we receive another character we stay in the tag |
|
172 |
+ * header name state, but if we receive a ' ' we go the attribute list |
|
173 |
+ * state. |
|
174 |
+ */ |
|
175 |
+ private static final TagHeaderStates[][] TagHeaderStateMachine = new TagHeaderStates[][] { |
|
176 |
+ // Init state |
|
177 |
+ { TagHeaderStates.Init, TagHeaderStates.TagStart, TagHeaderStates.Invalid, TagHeaderStates.Invalid, |
|
178 |
+ TagHeaderStates.Invalid, TagHeaderStates.Invalid }, |
|
179 |
+ // Tag start state '<' |
|
180 |
+ { TagHeaderStates.TagStart, TagHeaderStates.Invalid, TagHeaderStates.Name, TagHeaderStates.Invalid, |
|
181 |
+ TagHeaderStates.Invalid, TagHeaderStates.Invalid }, |
|
182 |
+ // Tag name state '<' + ' tagname ' |
|
183 |
+ { TagHeaderStates.AttrList, TagHeaderStates.Invalid, TagHeaderStates.Name, TagHeaderStates.EmptyTagEnd, |
|
184 |
+ TagHeaderStates.End, TagHeaderStates.Invalid }, |
|
185 |
+ // Attribute list state 'key='val' key2='val'' (this will be |
|
186 |
+ // processed in its own state machine) |
|
187 |
+ { TagHeaderStates.AttrList, TagHeaderStates.Invalid, TagHeaderStates.Invalid, TagHeaderStates.EmptyTagEnd, |
|
188 |
+ TagHeaderStates.End, TagHeaderStates.Invalid }, |
|
189 |
+ // Empty tag end state, '/' + '>' |
|
190 |
+ { TagHeaderStates.Invalid, TagHeaderStates.Invalid, TagHeaderStates.Invalid, TagHeaderStates.Invalid, |
|
191 |
+ TagHeaderStates.End, TagHeaderStates.Invalid }, }; |
|
192 |
+ |
|
193 |
+ /** |
|
194 |
+ * Attribute parser data such as where is the position where the attribute |
|
195 |
+ * name starts and so forth |
|
196 |
+ */ |
|
197 |
+ private static class TagHeaderParserData { |
|
198 |
+ public int nameStart = 0; |
|
199 |
+ public int nameEnd = 0; |
|
200 |
+ public HashMap<String, String> attributes = new HashMap<String, String>(); |
|
201 |
+ public boolean empty = false; |
|
202 |
+ } |
|
203 |
+ |
|
204 |
+ /** |
|
205 |
+ * Parse the tag name and attribute list |
|
206 |
+ * |
|
207 |
+ * @param xml |
|
208 |
+ * @param pointer |
|
209 |
+ * @param tag |
|
210 |
+ * @return position from where to continue parsing |
|
211 |
+ * @throws XmlParseException |
|
212 |
+ */ |
|
213 |
+ private static int parseTagHeader(String xml, int pointer, XmlTag tag) throws XmlParseException { |
|
214 |
+ TagHeaderStates state = TagHeaderStates.Init; |
|
215 |
+ TagHeaderParserData parserData = new TagHeaderParserData(); |
|
216 |
+ |
|
217 |
+ int i = pointer; |
|
218 |
+ boolean done = i >= xml.length(); |
|
219 |
+ while (!done) { |
|
220 |
+ // From the current character determine its corresponding action in |
|
221 |
+ // the state machine |
|
222 |
+ char nextChar = xml.charAt(i); |
|
223 |
+ TagHeaderActions action = parseCharIntoTagHeaderAction(nextChar); |
|
224 |
+ |
|
225 |
+ // Apply the action to the current state of the state machine and |
|
226 |
+ // obtain its resulting new state |
|
227 |
+ TagHeaderStates newState = TagHeaderStateMachine[state.ordinal()][action.ordinal()]; |
|
228 |
+ |
|
229 |
+ // Process this state transition |
|
230 |
+ if (state != newState) // In this parser interesting stuff only |
|
231 |
+ // happens when we change state |
|
232 |
+ { |
|
233 |
+ i = processTagHeaderStateTransition(xml, i, state, newState, parserData); |
|
234 |
+ } else { |
|
235 |
+ i++; |
|
236 |
+ } |
|
237 |
+ |
|
238 |
+ state = newState; |
|
239 |
+ |
|
240 |
+ done = i >= xml.length() || state == TagHeaderStates.End || state == TagHeaderStates.Invalid; |
|
241 |
+ } |
|
242 |
+ |
|
243 |
+ // If the tag header parsing was successful store the name and |
|
244 |
+ // attributes in the XML tag object |
|
245 |
+ if (state == TagHeaderStates.End) { |
|
246 |
+ String name = xml.substring(parserData.nameStart, parserData.nameEnd); |
|
247 |
+ |
|
248 |
+ tag.name = name; |
|
249 |
+ tag.attributes = parserData.attributes; |
|
250 |
+ tag.empty = parserData.empty; |
|
251 |
+ } else { |
|
252 |
+ throw new XmlParseException("Error parsing tag header", i); |
|
253 |
+ } |
|
254 |
+ |
|
255 |
+ return i; |
|
256 |
+ } |
|
257 |
+ |
|
258 |
+ /** |
|
259 |
+ * Convert a char to its corresponding TagHeaderAction for the tag header |
|
260 |
+ * parser state machine |
|
261 |
+ * |
|
262 |
+ * @param c |
|
263 |
+ * Character to parse |
|
264 |
+ * @return The corresponding TagHeaderAction for the input character |
|
265 |
+ */ |
|
266 |
+ private static TagHeaderActions parseCharIntoTagHeaderAction(char c) { |
|
267 |
+ // By default we mark it as a valid value character |
|
268 |
+ TagHeaderActions res = TagHeaderActions.NameChar; |
|
269 |
+ |
|
270 |
+ // Checking if it's some form of whitespace |
|
271 |
+ if (Character.isWhitespace(c)) { |
|
272 |
+ res = TagHeaderActions.Space; |
|
273 |
+ } else if (c == '<') { |
|
274 |
+ res = TagHeaderActions.TagInit; |
|
275 |
+ } else if (c == '>') { |
|
276 |
+ res = TagHeaderActions.TagEnd; |
|
277 |
+ } else if (c == '/') { |
|
278 |
+ res = TagHeaderActions.Slash; |
|
279 |
+ } else if (c == '\'' || c == '"') { |
|
280 |
+ res = TagHeaderActions.Invalid; |
|
281 |
+ } |
|
282 |
+ |
|
283 |
+ return res; |
|
284 |
+ } |
|
285 |
+ |
|
286 |
+ /** |
|
287 |
+ * Process a state transition |
|
288 |
+ * |
|
289 |
+ * @param pos |
|
290 |
+ * Current parsing position |
|
291 |
+ * @param from |
|
292 |
+ * Old state |
|
293 |
+ * @param to |
|
294 |
+ * New state |
|
295 |
+ * @param parserData |
|
296 |
+ * Here it will be stored name and value positions as they are |
|
297 |
+ * found |
|
298 |
+ * @return the position from where to continue parsing |
|
299 |
+ */ |
|
300 |
+ private static int processTagHeaderStateTransition(String xml, int pos, TagHeaderStates from, TagHeaderStates to, |
|
301 |
+ TagHeaderParserData parserData) { |
|
302 |
+ // By default we continue parsing from the next character |
|
303 |
+ int res = pos + 1; |
|
304 |
+ |
|
305 |
+ // Transition from a non-name state to a name state, we store the |
|
306 |
+ // initial position of the name |
|
307 |
+ if (from != TagHeaderStates.Name && to == TagHeaderStates.Name) { |
|
308 |
+ parserData.nameStart = pos; |
|
309 |
+ } |
|
310 |
+ // Transition from a name state to a non-name state, we store the final |
|
311 |
+ // position of the name |
|
312 |
+ else if (from == TagHeaderStates.Name && to != TagHeaderStates.Name) { |
|
313 |
+ parserData.nameEnd = pos; |
|
314 |
+ } |
|
315 |
+ |
|
316 |
+ // Parse the attribute list, it has its own parser, it will return the |
|
317 |
+ // position from where to continue parsing |
|
318 |
+ if (from != TagHeaderStates.AttrList && to == TagHeaderStates.AttrList) { |
|
319 |
+ res = parseAttributeList(xml, pos, parserData.attributes); |
|
320 |
+ } |
|
321 |
+ |
|
322 |
+ // If we find a '/' it means this tag has no body |
|
323 |
+ if (to == TagHeaderStates.EmptyTagEnd) { |
|
324 |
+ parserData.empty = true; |
|
325 |
+ } |
|
326 |
+ |
|
327 |
+ return res; |
|
328 |
+ } |
|
329 |
+ |
|
330 |
+/** |
|
331 |
+ * Checks if the next tag is an ending tag with the name of the parent |
|
332 |
+ * @param xml |
|
333 |
+ * @param pointer position starting with a '<' in xml |
|
334 |
+ * @param parentName name of the parent tag to check if it has children |
|
335 |
+ * @return if there are tags before the ending tag of the parent |
|
336 |
+ */ |
|
337 |
+ private static boolean hasChildren(String xml, int pointer, String parentName) { |
|
338 |
+ boolean res = false; |
|
339 |
+ |
|
340 |
+ try { |
|
341 |
+ parseEndTag(xml, pointer, parentName); |
|
342 |
+ } catch (XmlParseException e) { |
|
343 |
+ res = true; |
|
344 |
+ } |
|
345 |
+ |
|
346 |
+ return res; |
|
347 |
+ } |
|
348 |
+ |
|
349 |
+ /** |
|
350 |
+ * Check this tag is the end tag for a specific parent tag |
|
351 |
+ * |
|
352 |
+ * @param xml |
|
353 |
+ * @param pointer |
|
354 |
+ * points to a tag that should be the end tag for tagName |
|
355 |
+ * @param tagName |
|
356 |
+ * @return position from where to continue parsing |
|
357 |
+ */ |
|
358 |
+ private static int parseEndTag(String xml, int pointer, String tagName) throws XmlParseException { |
|
359 |
+ int res; |
|
360 |
+ |
|
361 |
+ boolean correct = xml.startsWith("</", pointer); |
|
362 |
+ |
|
363 |
+ // Getting everything between the initial '</' and a '>' |
|
364 |
+ int endPos = xml.indexOf(">", pointer); |
|
365 |
+ if (correct) { |
|
366 |
+ correct = correct && endPos != -1; |
|
367 |
+ } |
|
368 |
+ |
|
369 |
+ if (correct) { |
|
370 |
+ String potentialParentEndTag = xml.substring(pointer + "</".length(), endPos); |
|
371 |
+ |
|
372 |
+ // Trimming any spaces before and after the string we have generated |
|
373 |
+ potentialParentEndTag = potentialParentEndTag.trim(); |
|
374 |
+ |
|
375 |
+ // Here we should have the name of the parent tag |
|
376 |
+ correct = tagName.equals(potentialParentEndTag); |
|
377 |
+ } |
|
378 |
+ |
|
379 |
+ if (correct) { |
|
380 |
+ res = endPos + 1; |
|
381 |
+ } else { |
|
382 |
+ throw new XmlParseException(String.format("Expecting end tag <%s/>", tagName), pointer); |
|
383 |
+ } |
|
384 |
+ |
|
385 |
+ return res; |
|
386 |
+ } |
|
387 |
+ |
|
388 |
+ /** |
|
389 |
+ * All the different states the attribute parser can be in. |
|
390 |
+ */ |
|
391 |
+ private enum AttrStates { |
|
392 |
+ Init, Name, PreSeparator, Separator, PostSeparator, SingleQuotedContent, DoubleQuotedContent, End, Invalid |
|
393 |
+ } |
|
394 |
+ |
|
395 |
+ /** |
|
396 |
+ * Enumeration with all the possible events we can receive when parsing an |
|
397 |
+ * attribute. |
|
398 |
+ */ |
|
399 |
+ private enum AttrActions { |
|
400 |
+ Space, NameChar, Separator, SingleQuote, DoubleQuote, Slash, Invalid |
|
401 |
+ } |
|
402 |
+ |
|
403 |
+ /** |
|
404 |
+ * Structure that describes the attribute parser state machine. There is an |
|
405 |
+ * array entry for each attribute parser state, and for each of these |
|
406 |
+ * there's an array with an entry for each attribute parser action that |
|
407 |
+ * describes to which state should go when receiving that action. For |
|
408 |
+ * instance in the attribute name state if we receive another character we |
|
409 |
+ * stay in the attribute name state, but if we receive an '=' we go the |
|
410 |
+ * attribute separator state. |
|
411 |
+ */ |
|
412 |
+ private static final AttrStates[][] AttrStateMachine = new AttrStates[][] { |
|
413 |
+ // Init state |
|
414 |
+ { AttrStates.Init, AttrStates.Name, AttrStates.Invalid, AttrStates.Invalid, AttrStates.Invalid, |
|
415 |
+ AttrStates.Invalid, AttrStates.Invalid }, |
|
416 |
+ // Attribute name state |
|
417 |
+ { AttrStates.PreSeparator, AttrStates.Name, AttrStates.Separator, AttrStates.Invalid, AttrStates.Invalid, |
|
418 |
+ AttrStates.Invalid, AttrStates.Invalid }, |
|
419 |
+ // Attribute pre separator state |
|
420 |
+ { AttrStates.PreSeparator, AttrStates.Invalid, AttrStates.Separator, AttrStates.Invalid, |
|
421 |
+ AttrStates.Invalid, AttrStates.Invalid, AttrStates.Invalid }, |
|
422 |
+ // Attribute separator state |
|
423 |
+ { AttrStates.PostSeparator, AttrStates.Invalid, AttrStates.Invalid, AttrStates.SingleQuotedContent, |
|
424 |
+ AttrStates.DoubleQuotedContent, AttrStates.Invalid, AttrStates.Invalid }, |
|
425 |
+ // Attribute separator post state |
|
426 |
+ { AttrStates.PostSeparator, AttrStates.Invalid, AttrStates.Invalid, AttrStates.SingleQuotedContent, |
|
427 |
+ AttrStates.DoubleQuotedContent, AttrStates.Invalid, AttrStates.Invalid }, |
|
428 |
+ // Single quoted content state |
|
429 |
+ { AttrStates.SingleQuotedContent, AttrStates.SingleQuotedContent, AttrStates.SingleQuotedContent, |
|
430 |
+ AttrStates.End, AttrStates.SingleQuotedContent, AttrStates.SingleQuotedContent, AttrStates.Invalid }, |
|
431 |
+ // Double quoted content state |
|
432 |
+ { AttrStates.DoubleQuotedContent, AttrStates.DoubleQuotedContent, AttrStates.DoubleQuotedContent, |
|
433 |
+ AttrStates.DoubleQuotedContent, AttrStates.End, AttrStates.DoubleQuotedContent, AttrStates.Invalid }, }; |
|
434 |
+ |
|
435 |
+ /** |
|
436 |
+ * Attribute parser data such as where is the position where the attribute |
|
437 |
+ * name starts and so forth |
|
438 |
+ */ |
|
439 |
+ private static class AttrParserData { |
|
440 |
+ public int nameStart = 0; |
|
441 |
+ public int nameEnd = 0; |
|
442 |
+ public int valueStart = 0; |
|
443 |
+ public int valueEnd = 0; |
|
444 |
+ } |
|
445 |
+ |
|
446 |
+ /** |
|
447 |
+ * Parse an attribute list, if it doesn't find anything or finds something |
|
448 |
+ * not belonging to an attribute list returns with the position of the |
|
449 |
+ * offending character, in the meantime it will have filled the attributes |
|
450 |
+ * hashtable argument with all the attributes it has found. |
|
451 |
+ * |
|
452 |
+ * @param xml |
|
453 |
+ * String to look for an attribute list |
|
454 |
+ * @param pointer |
|
455 |
+ * Position from where to start parsing |
|
456 |
+ * @param attributes |
|
457 |
+ * Output parameter to place all key-value entries with the |
|
458 |
+ * attributes found |
|
459 |
+ * @return the position for the parser to continue on |
|
460 |
+ */ |
|
461 |
+ private static int parseAttributeList(String xml, int pointer, HashMap<String, String> attributes) { |
|
462 |
+ int i = pointer; |
|
463 |
+ boolean done = false; |
|
464 |
+ |
|
465 |
+ // Go parsing attributes until we find something it is not an XML tag |
|
466 |
+ // attribute |
|
467 |
+ do { |
|
468 |
+ AttrStates state = AttrStates.Init; |
|
469 |
+ AttrParserData parserData = new AttrParserData(); |
|
470 |
+ boolean attrDone = i >= xml.length() || state == AttrStates.End || state == AttrStates.Invalid; |
|
471 |
+ while (!attrDone) { |
|
472 |
+ // From the current character determine its corresponding action |
|
473 |
+ // in the state machine |
|
474 |
+ char nextChar = xml.charAt(i); |
|
475 |
+ AttrActions action = parseCharIntoAttrAction(nextChar); |
|
476 |
+ |
|
477 |
+ // Apply the action to the current state of the state machine |
|
478 |
+ // and obtain its resulting new state |
|
479 |
+ AttrStates newState = AttrStateMachine[state.ordinal()][action.ordinal()]; |
|
480 |
+ |
|
481 |
+ // Process this state transition |
|
482 |
+ if (state != newState) // In this parser interesting stuff only |
|
483 |
+ // happens when we change state |
|
484 |
+ { |
|
485 |
+ i = processAttrStateTransition(i, state, newState, parserData); |
|
486 |
+ } else { |
|
487 |
+ i++; |
|
488 |
+ } |
|
489 |
+ |
|
490 |
+ state = newState; |
|
491 |
+ |
|
492 |
+ attrDone = i >= xml.length() || state == AttrStates.End || state == AttrStates.Invalid; |
|
493 |
+ } |
|
494 |
+ |
|
495 |
+ // If the attribute parsing was successful store it in the hash |
|
496 |
+ // table |
|
497 |
+ if (state == AttrStates.End) { |
|
498 |
+ String name = xml.substring(parserData.nameStart, parserData.nameEnd); |
|
499 |
+ String value = xml.substring(parserData.valueStart, parserData.valueEnd); |
|
500 |
+ |
|
501 |
+ // Escaping value literal |
|
502 |
+ value = unescapeXmlLiteral(value); |
|
503 |
+ |
|
504 |
+ attributes.put(name, value); |
|
505 |
+ } |
|
506 |
+ |
|
507 |
+ done = i >= xml.length() || state == AttrStates.Invalid; |
|
508 |
+ } while (!done); |
|
509 |
+ |
|
510 |
+ return i; |
|
511 |
+ } |
|
512 |
+ |
|
513 |
+ /** |
|
514 |
+ * Convert a char to its corresponding AttrAction for the attribute parser |
|
515 |
+ * state machine |
|
516 |
+ * |
|
517 |
+ * @param c |
|
518 |
+ * Character to parse |
|
519 |
+ * @return The corresponding AttrAction for the intut character |
|
520 |
+ */ |
|
521 |
+ private static AttrActions parseCharIntoAttrAction(char c) { |
|
522 |
+ // By default we mark it as a valid value character |
|
523 |
+ AttrActions res = AttrActions.NameChar; |
|
524 |
+ |
|
525 |
+ // Checking if it's some form of whitespace |
|
526 |
+ if (Character.isWhitespace(c)) { |
|
527 |
+ res = AttrActions.Space; |
|
528 |
+ } |
|
529 |
+ // For this parser '<' and '>' are invalid |
|
530 |
+ else if (c == '<' || c == '>') { |
|
531 |
+ res = AttrActions.Invalid; |
|
532 |
+ } |
|
533 |
+ // '=' separates the name of the attribute and its value |
|
534 |
+ else if (c == '=') { |
|
535 |
+ res = AttrActions.Separator; |
|
536 |
+ } |
|
537 |
+ // Values can be enclosed in single and double quotes |
|
538 |
+ else if (c == '\'') { |
|
539 |
+ res = AttrActions.SingleQuote; |
|
540 |
+ } else if (c == '"') { |
|
541 |
+ res = AttrActions.DoubleQuote; |
|
542 |
+ } else if (c == '/') { |
|
543 |
+ res = AttrActions.Slash; |
|
544 |
+ } |
|
545 |
+ |
|
546 |
+ return res; |
|
547 |
+ } |
|
548 |
+ |
|
549 |
+ /** |
|
550 |
+ * Process a state transition |
|
551 |
+ * |
|
552 |
+ * @param pos |
|
553 |
+ * Current parsing position |
|
554 |
+ * @param from |
|
555 |
+ * Old state |
|
556 |
+ * @param to |
|
557 |
+ * New state |
|
558 |
+ * @param parserData |
|
559 |
+ * Here it will be stored name and value positions as they are |
|
560 |
+ * found |
|
561 |
+ */ |
|
562 |
+ private static int processAttrStateTransition(int pos, AttrStates from, AttrStates to, AttrParserData parserData) { |
|
563 |
+ int res = pos + 1; |
|
564 |
+ |
|
565 |
+ // Transition from a non-name state to a name state, we store the |
|
566 |
+ // initial position of the name |
|
567 |
+ if (from != AttrStates.Name && to == AttrStates.Name) { |
|
568 |
+ parserData.nameStart = pos; |
|
569 |
+ } |
|
570 |
+ // Transition from a name state to a non-name state, we store the final |
|
571 |
+ // position of the name |
|
572 |
+ else if (from == AttrStates.Name && to != AttrStates.Name) { |
|
573 |
+ parserData.nameEnd = pos; |
|
574 |
+ } |
|
575 |
+ // Transition from a non-value state to a value state (single or double |
|
576 |
+ // quoted), store the initial value pos |
|
577 |
+ else if ((from != AttrStates.SingleQuotedContent && to == AttrStates.SingleQuotedContent) |
|
578 |
+ || (from != AttrStates.DoubleQuotedContent && to == AttrStates.DoubleQuotedContent)) { |
|
579 |
+ parserData.valueStart = pos + 1; |
|
580 |
+ } |
|
581 |
+ // Transition from a value state (single or double quoted) to a non |
|
582 |
+ // value state |
|
583 |
+ else if ((from == AttrStates.SingleQuotedContent && to == AttrStates.End) |
|
584 |
+ || (from == AttrStates.DoubleQuotedContent && to == AttrStates.End)) { |
|
585 |
+ parserData.valueEnd = pos; |
|
586 |
+ } |
|
587 |
+ |
|
588 |
+ // When we are in the invalid state here means this is not part of the |
|
589 |
+ // attribute list |
|
590 |
+ if (to == AttrStates.Invalid) { |
|
591 |
+ res = pos; |
|
592 |
+ } |
|
593 |
+ |
|
594 |
+ return res; |
|
595 |
+ } |
|
596 |
+ |
|
597 |
+ /** |
|
598 |
+ * All the different states the tag header parser can be in. |
|
599 |
+ */ |
|
600 |
+ private enum TagContentStates { |
|
601 |
+ Content, Gt, CDATA, End, Invalid |
|
602 |
+ } |
|
603 |
+ |
|
604 |
+ /** |
|
605 |
+ * Enumeration with all the possible events we can receive when parsing an |
|
606 |
+ * tag header. |
|
607 |
+ */ |
|
608 |
+ private enum TagContentActions { |
|
609 |
+ Char, TagInit, Exclamation, Invalid |
|
610 |
+ } |
|
611 |
+ |
|
612 |
+ /** |
|
613 |
+ * Structure that describes the tag content parser state machine. |
|
614 |
+ */ |
|
615 |
+ private static final TagContentStates[][] TagContentStateMachine = new TagContentStates[][] { |
|
616 |
+ // Content state |
|
617 |
+ { TagContentStates.Content, TagContentStates.Gt, TagContentStates.Invalid, TagContentStates.Invalid }, |
|
618 |
+ // GT state (a '<' has been found and we need to decide if it's a |
|
619 |
+ // new tag or a CDATA) |
|
620 |
+ { TagContentStates.End, TagContentStates.Invalid, TagContentStates.CDATA, TagContentStates.Invalid }, |
|
621 |
+ // CDATA state (it has it's own parser) |
|
622 |
+ { TagContentStates.Content, TagContentStates.Gt, TagContentStates.Invalid, TagContentStates.Invalid }, }; |
|
623 |
+ |
|
624 |
+ /** |
|
625 |
+ * Tag content parser data It contains all the data fragments it has found |
|
626 |
+ * along the way |
|
627 |
+ */ |
|
628 |
+ private static class TagContentParserData { |
|
629 |
+ public TagContentParserData(int pos) { |
|
630 |
+ lastContentStart = pos; |
|
631 |
+ } |
|
632 |
+ |
|
633 |
+ public LinkedList<String> contentBits = new LinkedList<String>(); |
|
634 |
+ public int lastContentStart; |
|
635 |
+ } |
|
636 |
+ |
|
637 |
+ /** |
|
638 |
+ * Parse the content of a tag |
|
639 |
+ * |
|
640 |
+ * @param xml |
|
641 |
+ * @param pointer |
|
642 |
+ * @param tag |
|
643 |
+ * @return position from where to continue parsing |
|
644 |
+ * @throws XmlParseException |
|
645 |
+ */ |
|
646 |
+ private static int parseTagContent(String xml, int pointer, XmlTag tag) throws XmlParseException { |
|
647 |
+ TagContentStates state = TagContentStates.Content; |
|
648 |
+ TagContentParserData parserData = new TagContentParserData(pointer); |
|
649 |
+ |
|
650 |
+ int i = pointer; |
|
651 |
+ boolean done = i >= xml.length(); |
|
652 |
+ while (!done) { |
|
653 |
+ // From the current character determine its corresponding action in |
|
654 |
+ // the state machine |
|
655 |
+ char nextChar = xml.charAt(i); |
|
656 |
+ TagContentActions action = parseCharIntoTagContentAction(nextChar); |
|
657 |
+ |
|
658 |
+ // Apply the action to the current state of the state machine and |
|
659 |
+ // obtain its resulting new state |
|
660 |
+ TagContentStates newState = TagContentStateMachine[state.ordinal()][action.ordinal()]; |
|
661 |
+ |
|
662 |
+ // Process this state transition |
|
663 |
+ if (state != newState) // In this parser interesting stuff only |
|
664 |
+ // happens when we change state |
|
665 |
+ { |
|
666 |
+ i = processTagContentStateTransition(xml, i, state, newState, parserData); |
|
667 |
+ } else { |
|
668 |
+ i++; |
|
669 |
+ } |
|
670 |
+ |
|
671 |
+ state = newState; |
|
672 |
+ |
|
673 |
+ done = i >= xml.length() || state == TagContentStates.End || state == TagContentStates.Invalid; |
|
674 |
+ } |
|
675 |
+ |
|
676 |
+ // If the tag content parsing was successful combine all the string bits |
|
677 |
+ // we have found into one |
|
678 |
+ if (state == TagContentStates.End) { |
|
679 |
+ String contentBit = parserData.contentBits.poll(); |
|
680 |
+ while (contentBit != null) { |
|
681 |
+ // TODO: Java seems to not have an efficient way of joining all |
|
682 |
+ // the strings into one using its |
|
683 |
+ // standard library, for now doing it like this |
|
684 |
+ tag.content += contentBit; |
|
685 |
+ |
|
686 |
+ contentBit = parserData.contentBits.poll(); |
|
687 |
+ } |
|
688 |
+ |
|
689 |
+ // We have to return a position minus two, because we have parsed a |
|
690 |
+ // '<' plus something else |
|
691 |
+ i -= 2; |
|
692 |
+ } else { |
|
693 |
+ throw new XmlParseException("Error parsing tag content", i); |
|
694 |
+ } |
|
695 |
+ |
|
696 |
+ return i; |
|
697 |
+ } |
|
698 |
+ |
|
699 |
+ /** |
|
700 |
+ * Convert a char to its corresponding TagContentAction for the tag content |
|
701 |
+ * parser state machine |
|
702 |
+ * |
|
703 |
+ * @param c |
|
704 |
+ * Character to parse |
|
705 |
+ * @return The corresponding TagContentAction for the input character |
|
706 |
+ */ |
|
707 |
+ private static TagContentActions parseCharIntoTagContentAction(char c) { |
|
708 |
+ // By default we mark it as a valid value character |
|
709 |
+ TagContentActions res = TagContentActions.Char; |
|
710 |
+ |
|
711 |
+ if (c == '<') { |
|
712 |
+ res = TagContentActions.TagInit; |
|
713 |
+ } else if (c == '!') { |
|
714 |
+ res = TagContentActions.Exclamation; |
|
715 |
+ } else if (c == '>' || c == '\'' || c == '"') { |
|
716 |
+ res = TagContentActions.Invalid; |
|
717 |
+ } |
|
718 |
+ |
|
719 |
+ return res; |
|
720 |
+ } |
|
721 |
+ |
|
722 |
+ /** |
|
723 |
+ * Process changing from one content tag parsing state to another |
|
724 |
+ * |
|
725 |
+ * @param xml |
|
726 |
+ * @param pos |
|
727 |
+ * @param from |
|
728 |
+ * @param to |
|
729 |
+ * @param parserData |
|
730 |
+ * @return Position from where to continue parsing |
|
731 |
+ * @throws XmlParseException |
|
732 |
+ */ |
|
733 |
+ private static int processTagContentStateTransition(String xml, int pos, TagContentStates from, |
|
734 |
+ TagContentStates to, TagContentParserData parserData) throws XmlParseException { |
|
735 |
+ int res = pos + 1; |
|
736 |
+ |
|
737 |
+ // Transition from a non-content state to a content state, we store the |
|
738 |
+ // initial position of this content bit |
|
739 |
+ if (from != TagContentStates.Content && to == TagContentStates.Content) { |
|
740 |
+ parserData.lastContentStart = pos; |
|
741 |
+ } |
|
742 |
+ // Transition from a content state to a non-content state, get the |
|
743 |
+ // substring for this content |
|
744 |
+ if (from == TagContentStates.Content && to != TagContentStates.Content) { |
|
745 |
+ if (pos != parserData.lastContentStart) { |
|
746 |
+ String contentBit = xml.substring(parserData.lastContentStart, pos); |
|
747 |
+ |
|
748 |
+ // Trimming initial and final spaces |
|
749 |
+ contentBit = contentBit.trim(); |
|
750 |
+ |
|
751 |
+ // unescaping string bit |
|
752 |
+ contentBit = unescapeXmlLiteral(contentBit); |
|
753 |
+ |
|
754 |
+ // Removing linefeeds and tabs |
|
755 |
+ contentBit = removeTabsAndLinefeeds(contentBit); |
|
756 |
+ |
|
757 |
+ parserData.contentBits.add(contentBit); |
|
758 |
+ } |
|
759 |
+ } |
|
760 |
+ // Transition to a CDATA state |
|
761 |
+ else if (from != TagContentStates.CDATA && to == TagContentStates.CDATA) { |
|
762 |
+ // We take away one from pos because pos has already passed over the |
|
763 |
+ // '<!' of the '<![CDATA[' |
|
764 |
+ res = parseCDATA(xml, pos - 1, parserData); |
|
765 |
+ } |
|
766 |
+ |
|
767 |
+ return res; |
|
768 |
+ } |
|
769 |
+ |
|
770 |
+ /** |
|
771 |
+ * Parses a CDATA piece of content |
|
772 |
+ * |
|
773 |
+ * @param xml |
|
774 |
+ * @param pos |
|
775 |
+ * Position pointing at the very start of a CDATA block |
|
776 |
+ * "<![CDATA[" |
|
777 |
+ * @param parserData |
|
778 |
+ * @return position from where to continue parsing |
|
779 |
+ */ |
|
780 |
+ private static int parseCDATA(String xml, int pos, TagContentParserData parserData) throws XmlParseException { |
|
781 |
+ boolean correct = xml.startsWith("<![CDATA[", pos); |
|
782 |
+ int res = pos; |
|
783 |
+ |
|
784 |
+ if (correct) { |
|
785 |
+ int cdataEnd = xml.indexOf("]]>", pos); |
|
786 |
+ |
|
787 |
+ correct = cdataEnd != -1; |
|
788 |
+ // We have a correct CDATA block |
|
789 |
+ if (correct) { |
|
790 |
+ String contentBit = xml.substring(pos + "<![CDATA[".length(), cdataEnd); |
|
791 |
+ |
|
792 |
+ parserData.contentBits.add(contentBit); |
|
793 |
+ |
|
794 |
+ res = cdataEnd + "]]>".length(); |
|
795 |
+ } |
|
796 |
+ } |
|
797 |
+ |
|
798 |
+ if (!correct) { |
|
799 |
+ throw new XmlParseException("Error parsing CDATA block", pos); |
|
800 |
+ } |
|
801 |
+ |
|
802 |
+ return res; |
|
803 |
+ } |
|
804 |
+ |
|
805 |
+ private static class StringPair { |
|
806 |
+ public StringPair(String first, String second) { |
|
807 |
+ this.first = first; |
|
808 |
+ this.second = second; |
|
809 |
+ } |
|
810 |
+ |
|
811 |
+ public String first; |
|
812 |
+ public String second; |
|
813 |
+ } |
|
814 |
+ |
|
815 |
+ private static final StringPair[] EscapedEntities = new StringPair[] { new StringPair("<", "<"), |
|
816 |
+ new StringPair(">", ">"), new StringPair("&", "&"), new StringPair("&apos", "'"), |
|
817 |
+ new StringPair(""", "\""), }; |
|
818 |
+ |
|
819 |
+ /** |
|
820 |
+ * Unescape XML literal, that is <, >, &, &apos, " |
|
821 |
+ * |
|
822 |
+ * @param literal |
|
823 |
+ * @return |
|
824 |
+ */ |
|
825 |
+ public static String unescapeXmlLiteral(String literal) { |
|
826 |
+ String res = literal; |
|
827 |
+ |
|
828 |
+ // Look for any escaped entities |
|
829 |
+ for (int i = 0; i < EscapedEntities.length; i++) { |
|
830 |
+ // For each entity replace all of its occurrences |
|
831 |
+ res = Utils.ReplaceStr(res, EscapedEntities[i].first, EscapedEntities[i].second); |
|
832 |
+ } |
|
833 |
+ |
|
834 |
+ return res; |
|
835 |
+ } |
|
836 |
+ |
|
837 |
+ /** |
|
838 |
+ * Escape XML literal, that is <, >, &, &apos, " |
|
839 |
+ * |
|
840 |
+ * @param literal |
|
841 |
+ * @param skip |
|
842 |
+ * List of tokens to skip escaping |
|
843 |
+ * @return escaped literal |
|
844 |
+ */ |
|
845 |
+ public static String escapeXmlLiteral(String literal, String[] skip) { |
|
846 |
+ String res = literal; |
|
847 |
+ TreeSet<String> skipSet = new TreeSet<String>(); |
|
848 |
+ if (skip != null) { |
|
849 |
+ for (int i = 0; i < skip.length; i++) { |
|
850 |
+ skipSet.add(skip[i]); |
|
851 |
+ } |
|
852 |
+ } |
|
853 |
+ |
|
854 |
+ // Look for any escaped entities |
|
855 |
+ for (int i = 0; i < EscapedEntities.length; i++) { |
|
856 |
+ if (!skipSet.contains(EscapedEntities[i].second)) { |
|
857 |
+ // For each entity replace all of its occurrences |
|
858 |
+ res = Utils.ReplaceStr(res, EscapedEntities[i].second, EscapedEntities[i].first); |
|
859 |
+ } |
|
860 |
+ } |
|
861 |
+ |
|
862 |
+ return res; |
|
863 |
+ } |
|
864 |
+ |
|
865 |
+ private static String removeTabsAndLinefeeds(String literal) { |
|
866 |
+ String res = literal; |
|
867 |
+ |
|
868 |
+ res = Utils.ReplaceStr(res, "\n", ""); |
|
869 |
+ res = Utils.ReplaceStr(res, "\t", ""); |
|
870 |
+ res = Utils.ReplaceStr(res, "\r", ""); |
|
871 |
+ |
|
872 |
+ return res; |
|
873 |
+ } |
|
874 |
+ |
|
875 |
+} |
0 | 876 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,39 @@ |
1 |
+/** |
|
2 |
+ * Copyright Antoni Silvestre |
|
3 |
+ * |
|
4 |
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not |
|
5 |
+ * use this file except in compliance with the License. You may obtain a copy of |
|
6 |
+ * the License at |
|
7 |
+ * |
|
8 |
+ * http://www.apache.org/licenses/LICENSE-2.0 |
|
9 |
+ * |
|
10 |
+ * Unless required by applicable law or agreed to in writing, software |
|
11 |
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT |
|
12 |
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the |
|
13 |
+ * License for the specific language governing permissions and limitations under |
|
14 |
+ * the License. |
|
15 |
+ */ |
|
16 |
+ |
|
17 |
+package com.github.asilvestre.jpurexml; |
|
18 |
+ |
|
19 |
/**
 * XML prologue description
 */
public class XmlPrologue {
    /**
     * XML version, null when not set
     */
    public String version;

    /**
     * Character encoding, null when not set
     */
    public String encoding;

    /**
     * Render the prologue as an XML declaration. Attributes that are not
     * set (null) are left out instead of being printed as the text "null".
     *
     * @return The XML declaration, i.e.
     *         <?xml version="1.0" encoding="UTF-8"?>
     */
    @Override
    public String toString() {
        StringBuilder res = new StringBuilder("<?xml");

        if (version != null) {
            res.append(" version=\"").append(version).append('"');
        }
        if (encoding != null) {
            res.append(" encoding=\"").append(encoding).append('"');
        }

        return res.append("?>").toString();
    }
}
0 | 40 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,132 @@ |
1 |
+/** |
|
2 |
+ * Copyright Antoni Silvestre |
|
3 |
+ * |
|
4 |
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not |
|
5 |
+ * use this file except in compliance with the License. You may obtain a copy of |
|
6 |
+ * the License at |
|
7 |
+ * |
|
8 |
+ * http://www.apache.org/licenses/LICENSE-2.0 |
|
9 |
+ * |
|
10 |
+ * Unless required by applicable law or agreed to in writing, software |
|
11 |
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT |
|
12 |
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the |
|
13 |
+ * License for the specific language governing permissions and limitations under |
|
14 |
+ * the License. |
|
15 |
+ */ |
|
16 |
+ |
|
17 |
+package com.github.asilvestre.jpurexml; |
|
18 |
+ |
|
19 |
+import java.util.AbstractMap; |
|
20 |
+import java.util.AbstractSequentialList; |
|
21 |
+import java.util.HashMap; |
|
22 |
+import java.util.Iterator; |
|
23 |
+import java.util.LinkedList; |
|
24 |
+import java.util.TreeSet; |
|
25 |
+ |
|
26 |
+/** |
|
27 |
+ * Class that represents an XML tag with its children, content and attributes. |
|
28 |
+ * It doesn't have getter or setter methods for its properties as it is only |
|
29 |
+ * intended to be a simple data container. |
|
30 |
+ */ |
|
31 |
+public class XmlTag { |
|
32 |
+ /** |
|
33 |
+ * Name for this tag |
|
34 |
+ */ |
|
35 |
+ public String name = ""; |
|
36 |
+ |
|
37 |
+ /** |
|
38 |
+ * Dictionary with all the attributes for this tag. |
|
39 |
+ */ |
|
40 |
+ public AbstractMap<String, String> attributes = new HashMap<String, String>(); |
|
41 |
+ |
|
42 |
+ /** |
|
43 |
+ * Ordered list with all the tag children for this tag. |
|
44 |
+ */ |
|
45 |
+ public AbstractSequentialList<XmlTag> children = new LinkedList<XmlTag>(); |
|
46 |
+ |
|
47 |
+ /** |
|
48 |
+ * Tag contents which are not child tags, i.e. <tag>content</tag>. If a tag |
|
49 |
+ * has content and child tags mixed all the content will be aggregated here |
|
50 |
+ * in one unit. |
|
51 |
+ */ |
|
52 |
+ public String content = ""; |
|
53 |
+ |
|
54 |
+ /** |
|
55 |
+ * If this tag is singleton. That is, it doesn't have a closing tag, for |
|
56 |
+ * instance <emptytag /> |
|
57 |
+ */ |
|
58 |
+ public boolean empty = false; |
|
59 |
+ |
|
60 |
+ @Override |
|
61 |
+ public String toString() { |
|
62 |
+ String res; |
|
63 |
+ |
|
64 |
+ // Printing tag header (name and attributes) |
|
65 |
+ String attrStr = ""; |
|
66 |
+ Iterator<String> keyIter = attributes.keySet().iterator(); |
|
67 |
+ |
|
68 |
+ // Sorting attributes (so its easier to writes tests) |
|
69 |
+ TreeSet<String> treeSet = new TreeSet<String>(); |
|
70 |
+ while (keyIter.hasNext()) { |
|
71 |
+ treeSet.add(keyIter.next()); |
|
72 |
+ } |
|
73 |
+ |
|
74 |
+ // Rendering the attributes |
|
75 |
+ Iterator<String> sortedKeyIter = treeSet.iterator(); |
|
76 |
+ while (sortedKeyIter.hasNext()) { |
|
77 |
+ String key = XmlParser.escapeXmlLiteral(sortedKeyIter.next(), null); |
|
78 |
+ String value = XmlParser.escapeXmlLiteral(attributes.get(key), new String[] { "\"", "'" }); |
|
79 |
+ |
|
80 |
+ boolean valueHasDoubleQuotes = value.indexOf("\"") != -1; |
|
81 |
+ String attrFormat = valueHasDoubleQuotes ? "%s='%s' " : "%s=\"%s\" "; |
|
82 |
+ attrStr += String.format(attrFormat, key, value); |
|
83 |
+ } |
|
84 |
+ |
|
85 |
+ String headerFormat = empty ? "<%s %s/>" : "<%s %s>"; |
|
86 |
+ res = String.format(headerFormat, XmlParser.escapeXmlLiteral(name, null), attrStr); |
|
87 |
+ |
|
88 |
+ // if the header is not empty print its children and ending tag |
|
89 |
+ if (!empty) { |
|
90 |
+ String childrenStr = ""; |
|
91 |
+ |
|
92 |
+ Iterator<XmlTag> childIter = children.iterator(); |
|
93 |
+ |
|
94 |
+ while (childIter.hasNext()) { |
|
95 |
+ XmlTag child = childIter.next(); |
|
96 |
+ |
|
97 |
+ childrenStr += child.toString(); |
|
98 |
+ } |
|
99 |
+ |
|
100 |
+ // Checking if the content has any char that needs to be inside a |
|
101 |
+ // CDATA block |
|
102 |
+ String procContent = contentHasSpecialChars() ? String.format("<![CDATA[%s]]>", content) : content; |
|
103 |
+ |
|
104 |
+ // Adding the ending tag |
|
105 |
+ res = String.format("%s%s%s</%s>", res, childrenStr, procContent, XmlParser.escapeXmlLiteral(name, null)); |
|
106 |
+ } |
|
107 |
+ |
|
108 |
+ return res; |
|
109 |
+ } |
|
110 |
+ |
|
111 |
+ /** |
|
112 |
+ * This is the list of special characters, this list might not be exhausted, |
|
113 |
+ * but my use of XML printing is mainly for testing purposes. |
|
114 |
+ */ |
|
115 |
+ private static char[] SpecialChars = new char[] { '\n', '\r', '\t', '<', '>', '&', '\'', '"', }; |
|
116 |
+ |
|
117 |
+ /** |
|
118 |
+ * @return The content of this tag has special characters and should be |
|
119 |
+ * printed enclosed in a CDATA block |
|
120 |
+ */ |
|
121 |
+ private boolean contentHasSpecialChars() { |
|
122 |
+ boolean res = false; |
|
123 |
+ for (int i = 0; i < SpecialChars.length && !res; i++) { |
|
124 |
+ res = content.indexOf(SpecialChars[i]) != -1; |
|
125 |
+ } |
|
126 |
+ |
|
127 |
+ // Check if it has initial or final spaces |
|
128 |
+ res = res || !content.trim().equals(content); |
|
129 |
+ |
|
130 |
+ return res; |
|
131 |
+ } |
|
132 |
+} |
0 | 133 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,60 @@ |
1 |
+/** |
|
2 |
+ * Copyright Antoni Silvestre |
|
3 |
+ * |
|
4 |
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not |
|
5 |
+ * use this file except in compliance with the License. You may obtain a copy of |
|
6 |
+ * the License at |
|
7 |
+ * |
|
8 |
+ * http://www.apache.org/licenses/LICENSE-2.0 |
|
9 |
+ * |
|
10 |
+ * Unless required by applicable law or agreed to in writing, software |
|
11 |
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT |
|
12 |
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the |
|
13 |
+ * License for the specific language governing permissions and limitations under |
|
14 |
+ * the License. |
|
15 |
+ */ |
|
16 |
+ |
|
17 |
+package com.github.asilvestre.jpurexml; |
|
18 |
+ |
|
19 |
+import junit.framework.TestCase; |
|
20 |
+ |
|
21 |
+/** |
|
22 |
+ * Utilities unit tests |
|
23 |
+ */ |
|
24 |
+public class UtilsTest extends TestCase { |
|
25 |
+ |
|
26 |
+ /** |
|
27 |
+ * Test replacing all occurrences of a single character. |
|
28 |
+ */ |
|
29 |
+ public void testReplaceStr() { |
|
30 |
+ String toReplace = "b"; |
|
31 |
+ String replacement = "c"; |
|
32 |
+ |
|
33 |
+ String[] inputs = new String[] { "aaaa", "bbbb", "aaabbaa" }; |
|
34 |
+ |
|
35 |
+ String[] outputs = new String[] { "aaaa", "cccc", "aaaccaa" }; |
|
36 |
+ |
|
37 |
+ for (int i = 0; i < inputs.length; i++) { |
|
38 |
+ String res = Utils.ReplaceStr(inputs[i], toReplace, replacement); |
|
39 |
+ assertEquals(outputs[i], res); |
|
40 |
+ } |
|
41 |
+ } |
|
42 |
+ |
|
43 |
+ /** |
|
44 |
+ * Test replacing all occurrences of a string longer than one character. |
|
45 |
+ */ |
|
46 |
+ public void testReplaceStrMulti() { |
|
47 |
+ String toReplace = "bb"; |
|
48 |
+ String replacement = "c"; |
|
49 |
+ |
|
50 |
+ String[] inputs = new String[] { "aaaa", "bbbb", "aaabbaa" }; |
|
51 |
+ |
|
52 |
+ String[] outputs = new String[] { "aaaa", "cc", "aaacaa" }; |
|
53 |
+ |
|
54 |
+ for (int i = 0; i < inputs.length; i++) { |
|
55 |
+ String res = Utils.ReplaceStr(inputs[i], toReplace, replacement); |
|
56 |
+ assertEquals(outputs[i], res); |
|
57 |
+ } |
|
58 |
+ } |
|
59 |
+ |
|
60 |
+} |
0 | 61 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,241 @@ |
1 |
+/** |
|
2 |
+ * Copyright Antoni Silvestre |
|
3 |
+ * |
|
4 |
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not |
|
5 |
+ * use this file except in compliance with the License. You may obtain a copy of |
|
6 |
+ * the License at |
|
7 |
+ * |
|
8 |
+ * http://www.apache.org/licenses/LICENSE-2.0 |
|
9 |
+ * |
|
10 |
+ * Unless required by applicable law or agreed to in writing, software |
|
11 |
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT |
|
12 |
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the |
|
13 |
+ * License for the specific language governing permissions and limitations under |
|
14 |
+ * the License. |
|
15 |
+ */ |
|
16 |
+ |
|
17 |
+package com.github.asilvestre.jpurexml; |
|
18 |
+ |
|
19 |
+import junit.framework.TestCase; |
|
20 |
+ |
|
21 |
+/** |
|
22 |
+ * Parses the main XML parsing functionality |
|
23 |
+ */ |
|
24 |
+public class XmlParserTest extends TestCase { |
|
25 |
+ |
|
26 |
+ /** |
|
27 |
+ * Test we parse the prologue correctly |
|
28 |
+ */ |
|
29 |
+ public void testParsePrologue() { |
|
30 |
+ String[] inputs = new String[] { "<?xml version=\"1.0\" encoding=\"UTF-8\"?><root/>", |
|
31 |
+ "<?xml version='1.0' encoding=\"UTF-8\"?><root/>", |
|
32 |
+ "<?xml version='2.0' encoding=\"UTF-8\"?>< root />", |
|
33 |
+ "<?xml encoding = \"UTF-8\" version='1.0' ?><root/>", |
|
34 |
+ "<?xml a='r' encoding = \"UTF-8\" version='1.0' ?><root/>", }; |
|
35 |
+ |
|
36 |
+ String[] outputs = new String[] { "<?xml version=\"1.0\" encoding=\"UTF-8\"?><root />", |
|
37 |
+ "<?xml version=\"1.0\" encoding=\"UTF-8\"?><root />", |
|
38 |
+ "<?xml version=\"2.0\" encoding=\"UTF-8\"?><root />", |
|
39 |
+ "<?xml version=\"1.0\" encoding=\"UTF-8\"?><root />", |
|
40 |
+ "<?xml version=\"1.0\" encoding=\"UTF-8\"?><root />", }; |
|