View Javadoc
1   package org.apache.fulcrum.parser;
2   
3   
4   /*
5    * Licensed to the Apache Software Foundation (ASF) under one
6    * or more contributor license agreements.  See the NOTICE file
7    * distributed with this work for additional information
8    * regarding copyright ownership.  The ASF licenses this file
9    * to you under the Apache License, Version 2.0 (the
10   * "License"); you may not use this file except in compliance
11   * with the License.  You may obtain a copy of the License at
12   *
13   *   http://www.apache.org/licenses/LICENSE-2.0
14   *
15   * Unless required by applicable law or agreed to in writing,
16   * software distributed under the License is distributed on an
17   * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
18   * KIND, either express or implied.  See the License for the
19   * specific language governing permissions and limitations
20   * under the License.
21   */
22  
23  
24  import java.io.BufferedReader;
25  import java.io.IOException;
26  import java.io.InputStreamReader;
27  import java.io.Reader;
28  import java.io.StreamTokenizer;
29  import java.util.ArrayList;
30  import java.util.Iterator;
31  import java.util.List;
32  import java.util.NoSuchElementException;
33  
34  import org.apache.avalon.framework.logger.LogEnabled;
35  import org.apache.avalon.framework.logger.Logger;
36  
37  /**
38   * DataStreamParser is used to parse a stream with a fixed format and
39   * generate ValueParser objects which can be used to extract the values
40   * in the desired type.
41   *
42   * <p>The class itself is abstract - a concrete subclass which implements
43   * the initTokenizer method such as CSVParser or TSVParser is required
44   * to use the functionality.
45   *
46   * <p>The class implements the java.util.Iterator interface for convenience.
47   * This allows simple use in a Velocity template for example:
48   *
49   * <pre>
50   * #foreach ($row in $datastream)
51   *   Name: $row.Name
52   *   Description: $row.Description
53   * #end
54   * </pre>
55   *
56   * @author <a href="mailto:sean@informage.net">Sean Legassick</a>
57   * @version $Id$
58   */
59  public abstract class DataStreamParser
60      implements Iterator<ValueParser>, LogEnabled
61  {
62      /**
63       * The list of column names.
64       */
65      private List<String>    columnNames;
66  
67      /**
68       * The stream tokenizer for reading values from the input reader.
69       */
70      private final StreamTokenizer tokenizer;
71  
72      /**
73       * The parameter parser holding the values of columns for the current line.
74       */
75      private ValueParser     lineValues;
76  
77      /**
78       * Indicates whether or not the tokenizer has read anything yet.
79       */
80      private boolean         neverRead = true;
81  
82      /**
83       * The character encoding of the input
84       */
85      private String          characterEncoding;
86  
87      /**
88       * Logger to use
89       */
90      protected Logger log;
91  
92      /**
93       * Create a new DataStreamParser instance. Requires a Reader to read the
94       * comma-separated values from, a list of column names and a
95       * character encoding.
96       *
97       * @param in the input reader.
98       * @param columnNames a list of column names.
99       * @param characterEncoding the character encoding of the input.
100      */
101     public DataStreamParser(Reader in, List<String> columnNames,
102             String characterEncoding)
103     {
104         this.columnNames = columnNames;
105         this.characterEncoding = characterEncoding;
106 
107         if (this.characterEncoding == null)
108         {
109             // try and get the characterEncoding from the reader
110             this.characterEncoding = "US-ASCII";
111             try
112             {
113                 this.characterEncoding = ((InputStreamReader)in).getEncoding();
114             }
115             catch (ClassCastException e)
116             {
117                 // ignore
118             }
119         }
120 
121         tokenizer = new StreamTokenizer(new BufferedReader(in));
122         initTokenizer(tokenizer);
123     }
124 
125     /**
126      * Initialize the StreamTokenizer instance used to read the lines
127      * from the input reader. This must be implemented in subclasses to
128      * set up the tokenizing properties.
129      * 
130      * @param tokenizer the StreamTokenizer to use
131      */
132     protected abstract void initTokenizer(StreamTokenizer tokenizer);
133 
134     /**
135      * Provide a logger
136      *
137      * @see org.apache.avalon.framework.logger.LogEnabled#enableLogging(org.apache.avalon.framework.logger.Logger)
138      */
139     public void enableLogging(Logger logger)
140     {
141         this.log = logger.getChildLogger("DataStreamParser");
142     }
143 
144     /**
145      * Set the list of column names explicitly.
146      *
147      * @param columnNames A list of column names.
148      */
149     public void setColumnNames(List<String> columnNames)
150     {
151         this.columnNames = columnNames;
152     }
153 
154     /**
155      * Read the list of column names from the input reader using the
156      * tokenizer.
157      *
158      * @exception IOException an IOException occurred.
159      */
160     public void readColumnNames()
161         throws IOException
162     {
163         columnNames = new ArrayList<String>();
164 
165         neverRead = false;
166         tokenizer.nextToken();
167         while (tokenizer.ttype == StreamTokenizer.TT_WORD
168                || tokenizer.ttype == '"')
169         {
170             columnNames.add(tokenizer.sval);
171             tokenizer.nextToken();
172         }
173     }
174 
175     /**
176      * Determine whether a further row of values exists in the input.
177      *
178      * @return true if the input has more rows.
179      * @exception IOException an IOException occurred.
180      */
181     public boolean hasNextRow()
182         throws IOException
183     {
184         // check for end of line ensures that an empty last line doesn't
185         // give a false positive for hasNextRow
186         if (neverRead || tokenizer.ttype == StreamTokenizer.TT_EOL)
187         {
188             tokenizer.nextToken();
189             tokenizer.pushBack();
190             neverRead = false;
191         }
192         return tokenizer.ttype != StreamTokenizer.TT_EOF;
193     }
194 
195     /**
196      * Returns a ValueParser object containing the next row of values.
197      *
198      * @return a ValueParser object.
199      * @exception IOException an IOException occurred.
200      * @exception NoSuchElementException there are no more rows in the input.
201      */
202     public ValueParser nextRow()
203         throws IOException, NoSuchElementException
204     {
205         if (!hasNextRow())
206         {
207             throw new NoSuchElementException();
208         }
209 
210         if (lineValues == null)
211         {
212             lineValues = new BaseValueParser(characterEncoding);
213         }
214         else
215         {
216             lineValues.clear();
217         }
218 
219         Iterator<String> it = columnNames.iterator();
220         tokenizer.nextToken();
221         while (tokenizer.ttype == StreamTokenizer.TT_WORD
222                || tokenizer.ttype == '"')
223         {
224             // note this means that if there are more values than
225             // column names, the extra values are discarded.
226             if (it.hasNext())
227             {
228                 String colname = it.next().toString();
229                 String colval  = tokenizer.sval;
230                 if (log.isDebugEnabled())
231                 {
232                     log.debug("DataStreamParser.nextRow(): " +
233                               colname + '=' + colval);
234                 }
235                 lineValues.add(colname, colval);
236             }
237             tokenizer.nextToken();
238         }
239 
240         return lineValues;
241     }
242 
243     /**
244      * Determine whether a further row of values exists in the input.
245      *
246      * @return true if the input has more rows.
247      */
248     public boolean hasNext()
249     {
250         boolean hasNext = false;
251 
252         try
253         {
254             hasNext = hasNextRow();
255         }
256         catch (IOException e)
257         {
258             log.error("IOException in CSVParser.hasNext", e);
259         }
260 
261         return hasNext;
262     }
263 
264     /**
265      * Returns a ValueParser object containing the next row of values.
266      *
267      * @return a ValueParser object as an Object.
268      * @exception NoSuchElementException there are no more rows in the input
269      *                                   or an IOException occurred.
270      */
271     public ValueParser next()
272         throws NoSuchElementException
273     {
274         ValueParser nextRow = null;
275 
276         try
277         {
278             nextRow = nextRow();
279         }
280         catch (IOException e)
281         {
282             log.error("IOException in CSVParser.next", e);
283             throw new NoSuchElementException();
284         }
285 
286         return nextRow;
287     }
288 
289     /**
290      * The optional Iterator.remove method is not supported.
291      *
292      * @exception UnsupportedOperationException the operation is not supported.
293      */
294     public void remove()
295         throws UnsupportedOperationException
296     {
297         throw new UnsupportedOperationException();
298     }
299 }