View Javadoc

1   package org.apache.turbine.util.parser;
2   
3   /*
4    * Licensed to the Apache Software Foundation (ASF) under one
5    * or more contributor license agreements.  See the NOTICE file
6    * distributed with this work for additional information
7    * regarding copyright ownership.  The ASF licenses this file
8    * to you under the Apache License, Version 2.0 (the
9    * "License"); you may not use this file except in compliance
10   * with the License.  You may obtain a copy of the License at
11   *
12   *   http://www.apache.org/licenses/LICENSE-2.0
13   *
14   * Unless required by applicable law or agreed to in writing,
15   * software distributed under the License is distributed on an
16   * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
17   * KIND, either express or implied.  See the License for the
18   * specific language governing permissions and limitations
19   * under the License.
20   */
21  
22  import java.io.BufferedReader;
23  import java.io.IOException;
24  import java.io.InputStreamReader;
25  import java.io.Reader;
26  import java.io.StreamTokenizer;
27  
28  import java.util.ArrayList;
29  import java.util.Collections;
30  import java.util.Iterator;
31  import java.util.List;
32  import java.util.NoSuchElementException;
33  
34  import org.apache.commons.lang.exception.NestableRuntimeException;
35  
36  /***
37   * DataStreamParser is used to parse a stream with a fixed format and
38   * generate ValueParser objects which can be used to extract the values
39   * in the desired type.
40   *
41   * <p>The class itself is abstract - a concrete subclass which implements
42   * the initTokenizer method such as CSVParser or TSVParser is required
43   * to use the functionality.
44   *
45   * <p>The class implements the java.util.Iterator interface for convenience.
46   * This allows simple use in a Velocity template for example:
47   *
48   * <pre>
49   * #foreach ($row in $datastream)
50   *   Name: $row.Name
51   *   Description: $row.Description
52   * #end
53   * </pre>
54   *
55   * @author <a href="mailto:sean@informage.net">Sean Legassick</a>
56   * @author <a href="mailto:martin@mvdb.net">Martin van den Bemt</a>
57   * @author <a href="mailto:hps@intermeta.de">Henning P. Schmiedehausen</a>
58   * @version $Id: DataStreamParser.java 534527 2007-05-02 16:10:59Z tv $
59   */
60  public abstract class DataStreamParser implements Iterator
61  {
62      /***
63       * The constant for empty fields
64       */
65      protected static final String EMPTYFIELDNAME = "UNKNOWNFIELD";
66  
67      /***
68       * The list of column names.
69       */
70      private List columnNames = Collections.EMPTY_LIST;
71  
72      /***
73       * The stream tokenizer for reading values from the input reader.
74       */
75      private StreamTokenizer tokenizer;
76  
77      /***
78       * The parameter parser holding the values of columns for the current line.
79       */
80      private ValueParser lineValues;
81  
82      /***
83       * Indicates whether or not the tokenizer has read anything yet.
84       */
85      private boolean neverRead = true;
86  
87      /***
88       * The character encoding of the input
89       */
90      private String characterEncoding;
91  
92      /***
93       * The fieldseperator, which can be almost any char
94       */
95      private char fieldSeparator;
96  
97      /***
98       * Create a new DataStreamParser instance. Requires a Reader to read the
99       * comma-separated values from, a list of column names and a
100      * character encoding.
101      *
102      * @param in the input reader.
103      * @param columnNames a list of column names.
104      * @param characterEncoding the character encoding of the input.
105      */
106     public DataStreamParser(Reader in, List columnNames,
107                             String characterEncoding)
108     {
109         setColumnNames(columnNames);
110 
111         this.characterEncoding = characterEncoding;
112 
113         if (this.characterEncoding == null)
114         {
115             if (in instanceof InputStreamReader)
116             {
117                 this.characterEncoding = ((InputStreamReader) in).getEncoding();
118             }
119 
120             if (this.characterEncoding == null)
121             {
122                 // try and get the characterEncoding from the reader
123                 this.characterEncoding = "US-ASCII";
124             }
125         }
126 
127         tokenizer = new StreamTokenizer(new BufferedReader(in));
128         initTokenizer(tokenizer);
129     }
130 
131     /***
132      * Initialize the StreamTokenizer instance used to read the lines
133      * from the input reader. This must be implemented in subclasses to
134      * set up other tokenizing properties.
135      *
136      * @param tokenizer the tokenizer to adjust
137      */
138     protected void initTokenizer(StreamTokenizer tokenizer)
139     {
140         tokenizer.resetSyntax();
141 
142         // leave out the comma sign (,), we need it for empty fields
143         tokenizer.wordChars(' ', Character.MAX_VALUE);
144 
145         // and  set the quote mark as the quoting character
146         tokenizer.quoteChar('"');
147 
148         // and finally say that end of line is significant
149         tokenizer.eolIsSignificant(true);
150     }
151 
152     /***
153      * This method must be called to setup the field seperator
154      * @param fieldSeparator the char which separates the fields
155      */
156     public void setFieldSeparator(char fieldSeparator)
157     {
158         this.fieldSeparator = fieldSeparator;
159         // make this field also an ordinary char by default.
160         tokenizer.ordinaryChar(fieldSeparator);
161     }
162 
163     /***
164      * Set the list of column names explicitly.
165      *
166      * @param columnNames A list of column names.
167      */
168     public void setColumnNames(List columnNames)
169     {
170         if (columnNames != null)
171         {
172             this.columnNames = columnNames;
173         }
174     }
175 
176     /***
177      * get the list of column names.
178      *
179      */
180     public List getColumnNames()
181     {
182         return columnNames;
183     }
184 
185     /***
186      * Read the list of column names from the input reader using the
187      * tokenizer. If fieldNames are empty, we use the current fieldNumber
188      * + the EMPTYFIELDNAME to make one up.
189      *
190      * @exception IOException an IOException occurred.
191      */
192     public void readColumnNames()
193             throws IOException
194     {
195         List columnNames = new ArrayList();
196         int fieldCounter = 0;
197 
198         if (hasNextRow())
199         {
200             String colName = null;
201             boolean foundEol = false;
202 
203             while(!foundEol)
204             {
205                 tokenizer.nextToken();
206 
207                 if (tokenizer.ttype == '"'
208                         || tokenizer.ttype == StreamTokenizer.TT_WORD)
209                 {
210                     // tokenizer.ttype is either '"' or TT_WORD
211                     colName = tokenizer.sval;
212                 }
213                 else
214                 {
215                     // fieldSeparator, EOL or EOF
216                     fieldCounter++;
217 
218                     if (colName == null)
219                     {
220                         colName = EMPTYFIELDNAME + fieldCounter;
221                     }
222 
223                     columnNames.add(colName);
224                     colName = null;
225                 }
226 
227                 // EOL and EOF are checked independently from existing fields.
228                 if (tokenizer.ttype == StreamTokenizer.TT_EOL)
229                 {
230                     foundEol = true;
231                 }
232                 else if (tokenizer.ttype == StreamTokenizer.TT_EOF)
233                 {
234                     // Keep this token in the tokenizer for hasNext()
235                     tokenizer.pushBack();
236                     foundEol = true;
237                 }
238             }
239 
240             setColumnNames(columnNames);
241         }
242     }
243 
244     /***
245      * Determine whether a further row of values exists in the input.
246      *
247      * @return true if the input has more rows.
248      * @exception IOException an IOException occurred.
249      */
250     public boolean hasNextRow()
251             throws IOException
252     {
253         // check for end of line ensures that an empty last line doesn't
254         // give a false positive for hasNextRow
255         if (neverRead || tokenizer.ttype == StreamTokenizer.TT_EOL)
256         {
257             tokenizer.nextToken();
258             tokenizer.pushBack();
259             neverRead = false;
260         }
261         return tokenizer.ttype != StreamTokenizer.TT_EOF;
262     }
263 
264     /***
265      * Returns a ValueParser object containing the next row of values.
266      *
267      * @return a ValueParser object.
268      * @exception IOException an IOException occurred.
269      * @exception NoSuchElementException there are no more rows in the input.
270      */
271     public ValueParser nextRow()
272             throws IOException, NoSuchElementException
273     {
274         if (!hasNextRow())
275         {
276             throw new NoSuchElementException();
277         }
278 
279         if (lineValues == null)
280         {
281             lineValues = new BaseValueParser(characterEncoding);
282         }
283         else
284         {
285             lineValues.clear();
286         }
287 
288         Iterator it = columnNames.iterator();
289 
290         String currVal = "";
291         String colName = null;
292 
293         boolean foundEol = false;
294         while (!foundEol || it.hasNext())
295         {
296             if (!foundEol)
297             {
298                 tokenizer.nextToken();
299             }
300 
301             if (colName == null && it.hasNext())
302             {
303                 colName = String.valueOf(it.next());
304             }
305 
306             if (tokenizer.ttype == '"'
307                     || tokenizer.ttype == StreamTokenizer.TT_WORD)
308             {
309                 // tokenizer.ttype is either '"' or TT_WORD
310                 currVal = tokenizer.sval;
311             }
312             else
313             {
314                 // fieldSeparator, EOL or EOF
315                 lineValues.add(colName, currVal);
316                 colName = null;
317                 currVal = "";
318             }
319 
320             // EOL and EOF are checked independently from existing fields.
321             if (tokenizer.ttype == StreamTokenizer.TT_EOL)
322             {
323                 foundEol = true;
324             }
325             else if (tokenizer.ttype == StreamTokenizer.TT_EOF)
326             {
327                 // Keep this token in the tokenizer for hasNext()
328                 tokenizer.pushBack();
329                 foundEol = true;
330             }
331         }
332 
333         return lineValues;
334     }
335 
336     /***
337      * Determine whether a further row of values exists in the input.
338      *
339      * @return true if the input has more rows.
340      */
341     public boolean hasNext()
342     {
343         boolean hasNext = false;
344 
345         try
346         {
347             hasNext = hasNextRow();
348         }
349         catch (IOException e)
350         {
351             throw new NestableRuntimeException(e);
352         }
353 
354         return hasNext;
355     }
356 
357     /***
358      * Returns a ValueParser object containing the next row of values.
359      *
360      * @return a ValueParser object as an Object.
361      * @exception NoSuchElementException there are no more rows in the input
362      *                                   or an IOException occurred.
363      */
364     public Object next()
365             throws NoSuchElementException
366     {
367         Object nextRow = null;
368 
369         try
370         {
371             nextRow = nextRow();
372         }
373         catch (IOException e)
374         {
375             throw new NestableRuntimeException(e);
376         }
377 
378         return nextRow;
379     }
380 
381     /***
382      * The optional Iterator.remove method is not supported.
383      *
384      * @exception UnsupportedOperationException the operation is not supported.
385      */
386     public void remove()
387             throws UnsupportedOperationException
388     {
389         throw new UnsupportedOperationException();
390     }
391 }