1   /****************************************************************************
2    * This demo file is part of yFiles for Java 2.14.
3    * Copyright (c) 2000-2017 by yWorks GmbH, Vor dem Kreuzberg 28,
4    * 72070 Tuebingen, Germany. All rights reserved.
5    * 
6    * yFiles demo files exhibit yFiles for Java functionalities. Any redistribution
7    * of demo files in source code or binary form, with or without
8    * modification, is not permitted.
9    * 
10   * Owners of a valid software license for a yFiles for Java version that this
11   * demo is shipped with are allowed to use the demo source code as basis
12   * for their own yFiles for Java powered applications. Use of such programs is
13   * governed by the rights and conditions as set out in the yFiles for Java
14   * license agreement.
15   * 
16   * THIS SOFTWARE IS PROVIDED ''AS IS'' AND ANY EXPRESS OR IMPLIED
17   * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
18   * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN
19   * NO EVENT SHALL yWorks BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
20   * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
21   * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22   * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
23   * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
24   * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25   * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26   *
27   ***************************************************************************/
28  package demo.layout.genealogy.iohandler;
29  
30  import java.io.BufferedReader;
31  import java.io.ByteArrayOutputStream;
32  import java.io.IOException;
33  import java.io.InputStream;
34  import java.io.InputStreamReader;
35  import java.util.ArrayList;
36  import java.util.List;
37  import java.util.regex.Matcher;
38  import java.util.regex.Pattern;
39  
40  /**
41   * A parser that reads GEDCOM files into a graph.
42   * <p/>
43   * This implementation extracts the hierarchical structure and the data values of each line from the GEDCOM file. Then,
44   * it passes the information to a handler which builds the graph from the information in the preprocessed lines.
45   * <p/>
46   * There are several encodings supported that are specified in the GEDCOM standard: UTF8, UTF16, ANSEL, ASCII and ANSI.
47   */
48  public class GedcomInputParser {
49    public static final String ENCODING_ANSEL = "ANSEL";
50    public static final String ENCODING_UTF_8 = "UTF-8";
51    public static final String ENCODING_UTF_16LE = "UTF-16LE";
52    public static final String ENCODING_UTF_16BE = "UTF-16BE";
53    public static final String ENCODING_ASCII = "ASCII";
54    private static final String ENCODING_ANSI = "ANSI";
55  
56    static final String GEDCOM_ENCODING_ERROR_MESSAGE =
57        "GEDCOM - invalid encoding: The encoding of the file should be UTF-8, UTF-16BE, UTF-16LE, ANSEL, ASCII, or ANSI.";
58  
59    /**
60     * Extracts the information of each line of the GEDCOM file and passes it to the given handler.
61     * <p/>
62     * Before parsing the encoding of the file is detected.
63     *
64     * @param inStream the <code>InputStream</code> from the GEDCOM file
65     * @param handler  the handler which uses the extracted information to build a graph
66     * @throws IOException if the stream is not readable
67     */
68    public void parse(InputStream inStream, GedcomInputHandler handler) throws IOException {
69      ByteArrayOutputStream baos = new ByteArrayOutputStream();
70      String encoding = readEncoding(inStream, baos);
71      ResettingInputStream in = new ResettingInputStream(inStream, baos.toByteArray());
72  
73      if (encoding.equals(ENCODING_ANSEL)) {
74        parseLines(new AnselInputStreamReader(in), handler);
75      } else if (encoding.equals(ENCODING_UTF_8)) {
76        parseLines(new InputStreamReader(in, "UTF-8"), handler);
77      } else if (encoding.equals(ENCODING_UTF_16LE)) {
78        parseLines(new InputStreamReader(in, "UTF-16LE"), handler);
79      } else if (encoding.equals(ENCODING_UTF_16BE)) {
80        parseLines(new InputStreamReader(in, "UTF-16BE"), handler);
81      } else if (encoding.equals(ENCODING_ASCII)) {
82        parseLines(new InputStreamReader(in, "ASCII"), handler);
83      } else if (encoding.equals(ENCODING_ANSI)) {
84        parseLines(new InputStreamReader(in, "windows-1252"), handler);
85      } else {
86        throw new IOException(GEDCOM_ENCODING_ERROR_MESSAGE);
87      }
88    }
89  
90    /**
91     * Detects the encoding of the GEDCOM file.
92     * <p/>
93     * While searching for the encoding references, the bytes that are already read from the stream are stored in an
94     * <code>ByteArrayOutputStream</code>. So these bytes are still available for reading.
95     *
96     * @param inStream the stream from the GEDCOM file
97     * @param baos     the storage for bytes that are already read
98     * @return an identifier for the files encoding
99     * @throws IOException if the file does not fit the GEDCOM standard
100    */
101   static String readEncoding(InputStream inStream, ByteArrayOutputStream baos) throws IOException {
102     String encoding = null;
103     int firstByte = inStream.read();
104     if(firstByte < 0) {
105       throw new IOException(GEDCOM_ENCODING_ERROR_MESSAGE);
106     }
107     int secondByte = inStream.read();
108     if(secondByte < 0) {
109       throw new IOException(GEDCOM_ENCODING_ERROR_MESSAGE);
110     }
111 
112     if (firstByte == 0xef && secondByte == 0xbb) {
113       inStream.read();
114       encoding = ENCODING_UTF_8;
115     } else if (firstByte == 0xfe && secondByte == 0xff) {
116       encoding = ENCODING_UTF_16BE;
117     } else if (firstByte == 0xff && secondByte == 0xfe) {
118       encoding = ENCODING_UTF_16LE;
119     } else if (firstByte == 0x00 && secondByte == 0x30) {
120       baos.write(firstByte);
121       baos.write(secondByte);
122       encoding = ENCODING_UTF_16BE;
123     } else if (firstByte == 0x30 && secondByte == 0x00) {
124       baos.write(firstByte);
125       baos.write(secondByte);
126       encoding = ENCODING_UTF_16LE;
127     } else {
128       // there is no byte order mark
129       // according to the specification, the first line has to be "0 HEAD"
130       // this means firstByte corresponds to "0" and secondByte to " " 
131       String line;
132       baos.write(firstByte);
133       baos.write(secondByte);
134       while ((line = readLine(inStream, baos)) != null) {
135         if (line.startsWith("1 CHAR")) {
136           encoding = line.trim().substring(line.lastIndexOf(" ") + 1);
137         } else if (line.startsWith("0 ")) {
138           // since firstByte and secondByte stripped "0 " away from the first
139           // line, the next occurrence of the "0 " prefix signals the start
140           // of the first non-header top-level section and thus the end of
141           // the header section
142           // according to the specification, the CHAR directive has to be
143           // in the header section, so it is ok to stop looking for the CHAR
144           // directive now
145           break;
146         }
147       }
148     }
149     if (encoding == null) {
150       throw new IOException(GEDCOM_ENCODING_ERROR_MESSAGE);
151     }
152     return encoding;
153   }
154 
155   /**
156    * Reads a line from a file byte-wise and stores the read bytes.
157    *
158    * @param inStream the stream from the GEDCOM file
159    * @param baos     the storage for bytes that are already read
160    * @return the read line
161    * @throws IOException if the file does not fit the GEDCOM standard
162    */
163   static String readLine(InputStream inStream, ByteArrayOutputStream baos) throws IOException {
164     final int lastByte = baos.toByteArray()[baos.size() - 1];
165     int nextByte = readNextByte(inStream, baos);
166     if (nextByte == -1) {
167       return null;
168     }
169     ByteArrayOutputStream line = new ByteArrayOutputStream();
170     if (nextByte == '\n' || nextByte == '\r') {
171       if (nextByte != lastByte) {
172         nextByte = readNextByte(inStream, baos);
173       } else {
174         throw new IOException(
175             "GEDCOM - invalid line terminator: All lines in the file should end with \\n, \\r, \\n\\r or \\r\\n.");
176       }
177     }
178     while (nextByte != '\n' && nextByte != '\r' && nextByte != -1) {
179       line.write(nextByte);
180       nextByte = readNextByte(inStream, baos);
181     }
182     return line.toString();
183   }
184 
185   /**
186    * Reads a byte from a file and stores it in an <code>ByteArrayOutputStream</code>.
187    *
188    * @param inStream the stream from the GEDCOM file
189    * @param baos     the storage for bytes that are already read
190    * @return the read byte
191    * @throws IOException if the file does not fit the GEDCOM standard
192    */
193   static int readNextByte(InputStream inStream, ByteArrayOutputStream baos) throws IOException {
194     int nextByte = inStream.read();
195     if (nextByte > -1) {
196       baos.write(nextByte);
197     }
198     return nextByte;
199   }
200 
201   /**
202    * Extracts the fields level, id, tag and value from each line of the GEDCOM file and passes these fields to a handler
203    * that fills them into a graph structure.
204    *
205    * @param reader  the stream from the GEDCOM file
206    * @param handler a handler that builds the graph
207    * @throws IOException if the file does not fit the GEDCOM standard
208    */
209   static void parseLines(InputStreamReader reader, GedcomInputHandler handler) throws IOException {
210     final ParseContext parseContext = new ParseContext(handler);
211     BufferedReader buffer = new BufferedReader(reader);
212     int level;
213     String id, tag, value;
214 
215     // gedcom file must start with "0 HEAD"
216     String line = buffer.readLine();
217     if (!"0 HEAD".equals(line)) {
218       throw new IOException("GEDCOM - missing header: The file must start with \"0 HEAD\"");
219     }
220 
221     parseContext.handleStartDocument();
222 
223     // gedcom line: LevelNumber [ID ] Tag[ LineValue]
224     Pattern gedcomLinePattern = Pattern.compile("([0-9]{1,3}) ((@\\w[ \\p{Graph}&&[^%@]]*@) )?(_?[A-Z0-9]{2,4}) ?(.*)?");
225     Matcher matcher;
226 
227     while (line != null) {
228       ++parseContext.line;
229 
230       //if line is a gedcom line
231       matcher = gedcomLinePattern.matcher(line);
232       if (matcher.matches()) {
233         level = Integer.parseInt(matcher.group(1));
234         id = matcher.group(3);
235         tag = matcher.group(4);
236         value = matcher.group(5);
237 
238         line = buffer.readLine();
239 
240         if ("TRLR".equals(tag)) {
241           line = null;
242         }
243 
244         handleLine(parseContext, level, id, tag, value, line);
245       } else {
246         throw new IOException("GEDCOM - Invalid format at line " + parseContext.line);
247       }
248     }
249 
250     if (parseContext.size() > 0) {
251       throw new IOException("GEDCOM - Missing end tag: The file must end with \"0 TRLR\"");
252     }
253     parseContext.handleEndDocument();
254   }
255 
256   /**
257    * Handles the hierarchic structure of the GEDCOM file and passes the lines to the handler that builds the graph.
258    * <p/>
259    * If the value of the GEDCOM line ranges over several lines, these lines are collected and passed as one line.
260    *
261    * @param context the parse context where the hierarchy of the tags and some intermediary result are stored
262    * @param id      the id field of the GEDCOM line (might be <code>null</code>)
263    * @param tag     the tag field of the GEDCOM line
264    * @param value   the value field of the GEDCOM line (might be <code>null</code>)
265    * @param peek    the next line in the file
266    * @throws IOException if the file does not fit the GEDCOM standard
267    */
268   static void handleLine(ParseContext context, int level, String id, String tag, String value, String peek)
269       throws IOException {
270     if (!"CONT".equals(tag) && !"CONC".equals(tag)) {
271       context.push(tag);
272       context.level = level;
273       context.id = id;
274       context.value = new StringBuffer(50);
275     }
276 
277     if ("CONT".equals(tag)) {
278       context.value.append('\n');
279     }
280     if (value != null) {
281       context.value.append(value);
282     }
283 
284     if (peek != null) {
285       final int currentLevel = context.size() - 1;
286       if (!peek.startsWith((currentLevel + 1) + " CONT") && !peek.startsWith((currentLevel + 1) + " CONC")
287           && peek.indexOf(" ") >= 0) {
288         try {
289           int nextLevel = Integer.parseInt(peek.substring(0, peek.indexOf(" ")));
290           if (nextLevel > currentLevel) {
291             if (nextLevel == currentLevel + 1) {
292               context.handleStartTag(context.level, context.id, context.peek(), context.value.toString());
293             } else {
294               throw new IOException("GEDCOM - Invalid nesting at line " + context.line);
295             }
296           } else {
297             context.handleStartTag(context.level, context.id, context.peek(), context.value.toString());
298             context.handleEndTag(context.size() - 1, context.pop());
299             for (int i = nextLevel; i < currentLevel; i++) {
300               context.handleEndTag(context.size() - 1, context.pop());
301             }
302           }
303         } catch (NumberFormatException e) {
304           throw new IOException("GEDCOM - Invalid format at line " + (context.line + 1));
305         }
306       }
307     } else {
308       // stream end -> handle last line (0 TRLR)
309       context.handleStartTag(context.level, context.id, context.peek(), context.value.toString());
310       context.handleEndTag(context.size() - 1, context.pop());
311     }
312   }
313 
314   /**
315    * Delegates all handler calls to a given handler and additionally provides a storage for the hierarchic information
316    * of the GEDCOM file.
317    */
318   static final class ParseContext implements GedcomInputHandler {
319     final GedcomInputHandler handler;
320     final List stack;
321     int line;
322     int level;
323     String id;
324     StringBuffer value;
325 
326     ParseContext(GedcomInputHandler handler) {
327       if (handler == null) {
328         this.handler = new GedcomInputHandler() {
329           public void handleStartDocument() {
330           }
331 
332           public void handleEndDocument() {
333           }
334 
335           public void handleStartTag(int level, String id, String tag, String value) {
336           }
337 
338           public void handleEndTag(int level, String tag) {
339           }
340         };
341       } else {
342         this.handler = handler;
343       }
344       stack = new ArrayList();
345     }
346 
347     public void handleEndDocument() {
348       handler.handleEndDocument();
349     }
350 
351     public void handleEndTag(int level, String tag) {
352       handler.handleEndTag(level, tag);
353     }
354 
355     public void handleStartDocument() {
356       handler.handleStartDocument();
357     }
358 
359     public void handleStartTag(int level, String id, String tag, String value) {
360       handler.handleStartTag(level, id, tag, value);
361     }
362 
363     public void push(String tag) {
364       stack.add(tag);
365     }
366 
367     public String pop() {
368       return (String) stack.remove(stack.size() - 1);
369     }
370 
371     public String peek() {
372       return (String) stack.get(stack.size() - 1);
373     }
374 
375     public int size() {
376       return stack.size();
377     }
378   }
379 
380   /**
381    * Reads bytes from an array and then continues with the byte from a context <code>InputStream</code>.
382    * <p/>
383    * This class can be used to find a specific piece of information in a file (e.g. the encoding) and then return to the
384    * beginning to read the whole file.
385    */
386   static class ResettingInputStream extends InputStream {
387     private final InputStream inStream;
388     private final byte[] buffer;
389     private int pos;
390 
391     ResettingInputStream(InputStream in, byte[] firstBytes) {
392       inStream = in;
393       buffer = firstBytes;
394       pos = 0;
395     }
396 
397     public int read() throws IOException {
398       if (pos < buffer.length) {
399         return buffer[pos++] & 0xFF;
400       } else {
401         return inStream.read();
402       }
403     }
404 
405     public int read(byte[] b, int off, int len) throws IOException {
406       if (pos < buffer.length) {
407         final int amount = Math.min(len, buffer.length - pos);
408         System.arraycopy(buffer, pos, b, off, amount);
409         pos += amount;
410         return amount;
411       } else {
412         return inStream.read(b, off, len);
413       }
414     }
415   }
416 }
417