1
28 package demo.layout.genealogy.iohandler;
29
30 import java.io.BufferedReader;
31 import java.io.ByteArrayOutputStream;
32 import java.io.IOException;
33 import java.io.InputStream;
34 import java.io.InputStreamReader;
35 import java.util.ArrayList;
36 import java.util.List;
37 import java.util.regex.Matcher;
38 import java.util.regex.Pattern;
39
40
48 public class GedcomInputParser {
49 public static final String ENCODING_ANSEL = "ANSEL";
50 public static final String ENCODING_UTF_8 = "UTF-8";
51 public static final String ENCODING_UTF_16LE = "UTF-16LE";
52 public static final String ENCODING_UTF_16BE = "UTF-16BE";
53 public static final String ENCODING_ASCII = "ASCII";
54 private static final String ENCODING_ANSI = "ANSI";
55
56 static final String GEDCOM_ENCODING_ERROR_MESSAGE =
57 "GEDCOM - invalid encoding: The encoding of the file should be UTF-8, UTF-16BE, UTF-16LE, ANSEL, ASCII, or ANSI.";
58
59
68 public void parse(InputStream inStream, GedcomInputHandler handler) throws IOException {
69 ByteArrayOutputStream baos = new ByteArrayOutputStream();
70 String encoding = readEncoding(inStream, baos);
71 ResettingInputStream in = new ResettingInputStream(inStream, baos.toByteArray());
72
73 if (encoding.equals(ENCODING_ANSEL)) {
74 parseLines(new AnselInputStreamReader(in), handler);
75 } else if (encoding.equals(ENCODING_UTF_8)) {
76 parseLines(new InputStreamReader(in, "UTF-8"), handler);
77 } else if (encoding.equals(ENCODING_UTF_16LE)) {
78 parseLines(new InputStreamReader(in, "UTF-16LE"), handler);
79 } else if (encoding.equals(ENCODING_UTF_16BE)) {
80 parseLines(new InputStreamReader(in, "UTF-16BE"), handler);
81 } else if (encoding.equals(ENCODING_ASCII)) {
82 parseLines(new InputStreamReader(in, "ASCII"), handler);
83 } else if (encoding.equals(ENCODING_ANSI)) {
84 parseLines(new InputStreamReader(in, "windows-1252"), handler);
85 } else {
86 throw new IOException(GEDCOM_ENCODING_ERROR_MESSAGE);
87 }
88 }
89
90
101 static String readEncoding(InputStream inStream, ByteArrayOutputStream baos) throws IOException {
102 String encoding = null;
103 int firstByte = inStream.read();
104 if(firstByte < 0) {
105 throw new IOException(GEDCOM_ENCODING_ERROR_MESSAGE);
106 }
107 int secondByte = inStream.read();
108 if(secondByte < 0) {
109 throw new IOException(GEDCOM_ENCODING_ERROR_MESSAGE);
110 }
111
112 if (firstByte == 0xef && secondByte == 0xbb) {
113 inStream.read();
114 encoding = ENCODING_UTF_8;
115 } else if (firstByte == 0xfe && secondByte == 0xff) {
116 encoding = ENCODING_UTF_16BE;
117 } else if (firstByte == 0xff && secondByte == 0xfe) {
118 encoding = ENCODING_UTF_16LE;
119 } else if (firstByte == 0x00 && secondByte == 0x30) {
120 baos.write(firstByte);
121 baos.write(secondByte);
122 encoding = ENCODING_UTF_16BE;
123 } else if (firstByte == 0x30 && secondByte == 0x00) {
124 baos.write(firstByte);
125 baos.write(secondByte);
126 encoding = ENCODING_UTF_16LE;
127 } else {
128 String line;
132 baos.write(firstByte);
133 baos.write(secondByte);
134 while ((line = readLine(inStream, baos)) != null) {
135 if (line.startsWith("1 CHAR")) {
136 encoding = line.trim().substring(line.lastIndexOf(" ") + 1);
137 } else if (line.startsWith("0 ")) {
138 break;
146 }
147 }
148 }
149 if (encoding == null) {
150 throw new IOException(GEDCOM_ENCODING_ERROR_MESSAGE);
151 }
152 return encoding;
153 }
154
155
163 static String readLine(InputStream inStream, ByteArrayOutputStream baos) throws IOException {
164 final int lastByte = baos.toByteArray()[baos.size() - 1];
165 int nextByte = readNextByte(inStream, baos);
166 if (nextByte == -1) {
167 return null;
168 }
169 ByteArrayOutputStream line = new ByteArrayOutputStream();
170 if (nextByte == '\n' || nextByte == '\r') {
171 if (nextByte != lastByte) {
172 nextByte = readNextByte(inStream, baos);
173 } else {
174 throw new IOException(
175 "GEDCOM - invalid line terminator: All lines in the file should end with \\n, \\r, \\n\\r or \\r\\n.");
176 }
177 }
178 while (nextByte != '\n' && nextByte != '\r' && nextByte != -1) {
179 line.write(nextByte);
180 nextByte = readNextByte(inStream, baos);
181 }
182 return line.toString();
183 }
184
185
193 static int readNextByte(InputStream inStream, ByteArrayOutputStream baos) throws IOException {
194 int nextByte = inStream.read();
195 if (nextByte > -1) {
196 baos.write(nextByte);
197 }
198 return nextByte;
199 }
200
201
209 static void parseLines(InputStreamReader reader, GedcomInputHandler handler) throws IOException {
210 final ParseContext parseContext = new ParseContext(handler);
211 BufferedReader buffer = new BufferedReader(reader);
212 int level;
213 String id, tag, value;
214
215 String line = buffer.readLine();
217 if (!"0 HEAD".equals(line)) {
218 throw new IOException("GEDCOM - missing header: The file must start with \"0 HEAD\"");
219 }
220
221 parseContext.handleStartDocument();
222
223 Pattern gedcomLinePattern = Pattern.compile("([0-9]{1,3}) ((@\\w[ \\p{Graph}&&[^%@]]*@) )?(_?[A-Z0-9]{2,4}) ?(.*)?");
225 Matcher matcher;
226
227 while (line != null) {
228 ++parseContext.line;
229
230 matcher = gedcomLinePattern.matcher(line);
232 if (matcher.matches()) {
233 level = Integer.parseInt(matcher.group(1));
234 id = matcher.group(3);
235 tag = matcher.group(4);
236 value = matcher.group(5);
237
238 line = buffer.readLine();
239
240 if ("TRLR".equals(tag)) {
241 line = null;
242 }
243
244 handleLine(parseContext, level, id, tag, value, line);
245 } else {
246 throw new IOException("GEDCOM - Invalid format at line " + parseContext.line);
247 }
248 }
249
250 if (parseContext.size() > 0) {
251 throw new IOException("GEDCOM - Missing end tag: The file must end with \"0 TRLR\"");
252 }
253 parseContext.handleEndDocument();
254 }
255
256
268 static void handleLine(ParseContext context, int level, String id, String tag, String value, String peek)
269 throws IOException {
270 if (!"CONT".equals(tag) && !"CONC".equals(tag)) {
271 context.push(tag);
272 context.level = level;
273 context.id = id;
274 context.value = new StringBuffer(50);
275 }
276
277 if ("CONT".equals(tag)) {
278 context.value.append('\n');
279 }
280 if (value != null) {
281 context.value.append(value);
282 }
283
284 if (peek != null) {
285 final int currentLevel = context.size() - 1;
286 if (!peek.startsWith((currentLevel + 1) + " CONT") && !peek.startsWith((currentLevel + 1) + " CONC")
287 && peek.indexOf(" ") >= 0) {
288 try {
289 int nextLevel = Integer.parseInt(peek.substring(0, peek.indexOf(" ")));
290 if (nextLevel > currentLevel) {
291 if (nextLevel == currentLevel + 1) {
292 context.handleStartTag(context.level, context.id, context.peek(), context.value.toString());
293 } else {
294 throw new IOException("GEDCOM - Invalid nesting at line " + context.line);
295 }
296 } else {
297 context.handleStartTag(context.level, context.id, context.peek(), context.value.toString());
298 context.handleEndTag(context.size() - 1, context.pop());
299 for (int i = nextLevel; i < currentLevel; i++) {
300 context.handleEndTag(context.size() - 1, context.pop());
301 }
302 }
303 } catch (NumberFormatException e) {
304 throw new IOException("GEDCOM - Invalid format at line " + (context.line + 1));
305 }
306 }
307 } else {
308 context.handleStartTag(context.level, context.id, context.peek(), context.value.toString());
310 context.handleEndTag(context.size() - 1, context.pop());
311 }
312 }
313
314
318 static final class ParseContext implements GedcomInputHandler {
319 final GedcomInputHandler handler;
320 final List stack;
321 int line;
322 int level;
323 String id;
324 StringBuffer value;
325
326 ParseContext(GedcomInputHandler handler) {
327 if (handler == null) {
328 this.handler = new GedcomInputHandler() {
329 public void handleStartDocument() {
330 }
331
332 public void handleEndDocument() {
333 }
334
335 public void handleStartTag(int level, String id, String tag, String value) {
336 }
337
338 public void handleEndTag(int level, String tag) {
339 }
340 };
341 } else {
342 this.handler = handler;
343 }
344 stack = new ArrayList();
345 }
346
347 public void handleEndDocument() {
348 handler.handleEndDocument();
349 }
350
351 public void handleEndTag(int level, String tag) {
352 handler.handleEndTag(level, tag);
353 }
354
355 public void handleStartDocument() {
356 handler.handleStartDocument();
357 }
358
359 public void handleStartTag(int level, String id, String tag, String value) {
360 handler.handleStartTag(level, id, tag, value);
361 }
362
363 public void push(String tag) {
364 stack.add(tag);
365 }
366
367 public String pop() {
368 return (String) stack.remove(stack.size() - 1);
369 }
370
371 public String peek() {
372 return (String) stack.get(stack.size() - 1);
373 }
374
375 public int size() {
376 return stack.size();
377 }
378 }
379
380
386 static class ResettingInputStream extends InputStream {
387 private final InputStream inStream;
388 private final byte[] buffer;
389 private int pos;
390
391 ResettingInputStream(InputStream in, byte[] firstBytes) {
392 inStream = in;
393 buffer = firstBytes;
394 pos = 0;
395 }
396
397 public int read() throws IOException {
398 if (pos < buffer.length) {
399 return buffer[pos++] & 0xFF;
400 } else {
401 return inStream.read();
402 }
403 }
404
405 public int read(byte[] b, int off, int len) throws IOException {
406 if (pos < buffer.length) {
407 final int amount = Math.min(len, buffer.length - pos);
408 System.arraycopy(buffer, pos, b, off, amount);
409 pos += amount;
410 return amount;
411 } else {
412 return inStream.read(b, off, len);
413 }
414 }
415 }
416 }
417