1 package org.apache.fulcrum.parser;
2
3
4 /*
5 * Licensed to the Apache Software Foundation (ASF) under one
6 * or more contributor license agreements. See the NOTICE file
7 * distributed with this work for additional information
8 * regarding copyright ownership. The ASF licenses this file
9 * to you under the Apache License, Version 2.0 (the
10 * "License"); you may not use this file except in compliance
11 * with the License. You may obtain a copy of the License at
12 *
13 * http://www.apache.org/licenses/LICENSE-2.0
14 *
15 * Unless required by applicable law or agreed to in writing,
16 * software distributed under the License is distributed on an
17 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
18 * KIND, either express or implied. See the License for the
19 * specific language governing permissions and limitations
20 * under the License.
21 */
22
23
24 import java.io.BufferedReader;
25 import java.io.IOException;
26 import java.io.InputStreamReader;
27 import java.io.Reader;
28 import java.io.StreamTokenizer;
29 import java.util.ArrayList;
30 import java.util.Iterator;
31 import java.util.List;
32 import java.util.NoSuchElementException;
33
34 import org.apache.avalon.framework.logger.LogEnabled;
35 import org.apache.avalon.framework.logger.Logger;
36
37 /**
38 * DataStreamParser is used to parse a stream with a fixed format and
39 * generate ValueParser objects which can be used to extract the values
40 * in the desired type.
41 *
42 * <p>The class itself is abstract - a concrete subclass which implements
43 * the initTokenizer method such as CSVParser or TSVParser is required
44 * to use the functionality.
45 *
46 * <p>The class implements the java.util.Iterator interface for convenience.
47 * This allows simple use in a Velocity template for example:
48 *
49 * <pre>
50 * #foreach ($row in $datastream)
51 * Name: $row.Name
52 * Description: $row.Description
53 * #end
54 * </pre>
55 *
56 * @author <a href="mailto:sean@informage.net">Sean Legassick</a>
57 * @version $Id$
58 */
59 public abstract class DataStreamParser
60 implements Iterator<ValueParser>, LogEnabled
61 {
62 /**
63 * The list of column names.
64 */
65 private List<String> columnNames;
66
67 /**
68 * The stream tokenizer for reading values from the input reader.
69 */
70 private final StreamTokenizer tokenizer;
71
72 /**
73 * The parameter parser holding the values of columns for the current line.
74 */
75 private ValueParser lineValues;
76
77 /**
78 * Indicates whether or not the tokenizer has read anything yet.
79 */
80 private boolean neverRead = true;
81
82 /**
83 * The character encoding of the input
84 */
85 private String characterEncoding;
86
87 /**
88 * Logger to use
89 */
90 protected Logger log;
91
92 /**
93 * Create a new DataStreamParser instance. Requires a Reader to read the
94 * comma-separated values from, a list of column names and a
95 * character encoding.
96 *
97 * @param in the input reader.
98 * @param columnNames a list of column names.
99 * @param characterEncoding the character encoding of the input.
100 */
101 public DataStreamParser(Reader in, List<String> columnNames,
102 String characterEncoding)
103 {
104 this.columnNames = columnNames;
105 this.characterEncoding = characterEncoding;
106
107 if (this.characterEncoding == null)
108 {
109 // try and get the characterEncoding from the reader
110 this.characterEncoding = "US-ASCII";
111 try
112 {
113 this.characterEncoding = ((InputStreamReader)in).getEncoding();
114 }
115 catch (ClassCastException e)
116 {
117 // ignore
118 }
119 }
120
121 tokenizer = new StreamTokenizer(new BufferedReader(in));
122 initTokenizer(tokenizer);
123 }
124
125 /**
126 * Initialize the StreamTokenizer instance used to read the lines
127 * from the input reader. This must be implemented in subclasses to
128 * set up the tokenizing properties.
129 *
130 * @param tokenizer the StreamTokenizer to use
131 */
132 protected abstract void initTokenizer(StreamTokenizer tokenizer);
133
134 /**
135 * Provide a logger
136 *
137 * @see org.apache.avalon.framework.logger.LogEnabled#enableLogging(org.apache.avalon.framework.logger.Logger)
138 */
139 public void enableLogging(Logger logger)
140 {
141 this.log = logger.getChildLogger("DataStreamParser");
142 }
143
144 /**
145 * Set the list of column names explicitly.
146 *
147 * @param columnNames A list of column names.
148 */
149 public void setColumnNames(List<String> columnNames)
150 {
151 this.columnNames = columnNames;
152 }
153
154 /**
155 * Read the list of column names from the input reader using the
156 * tokenizer.
157 *
158 * @exception IOException an IOException occurred.
159 */
160 public void readColumnNames()
161 throws IOException
162 {
163 columnNames = new ArrayList<String>();
164
165 neverRead = false;
166 tokenizer.nextToken();
167 while (tokenizer.ttype == StreamTokenizer.TT_WORD
168 || tokenizer.ttype == '"')
169 {
170 columnNames.add(tokenizer.sval);
171 tokenizer.nextToken();
172 }
173 }
174
175 /**
176 * Determine whether a further row of values exists in the input.
177 *
178 * @return true if the input has more rows.
179 * @exception IOException an IOException occurred.
180 */
181 public boolean hasNextRow()
182 throws IOException
183 {
184 // check for end of line ensures that an empty last line doesn't
185 // give a false positive for hasNextRow
186 if (neverRead || tokenizer.ttype == StreamTokenizer.TT_EOL)
187 {
188 tokenizer.nextToken();
189 tokenizer.pushBack();
190 neverRead = false;
191 }
192 return tokenizer.ttype != StreamTokenizer.TT_EOF;
193 }
194
195 /**
196 * Returns a ValueParser object containing the next row of values.
197 *
198 * @return a ValueParser object.
199 * @exception IOException an IOException occurred.
200 * @exception NoSuchElementException there are no more rows in the input.
201 */
202 public ValueParser nextRow()
203 throws IOException, NoSuchElementException
204 {
205 if (!hasNextRow())
206 {
207 throw new NoSuchElementException();
208 }
209
210 if (lineValues == null)
211 {
212 lineValues = new BaseValueParser(characterEncoding);
213 }
214 else
215 {
216 lineValues.clear();
217 }
218
219 Iterator<String> it = columnNames.iterator();
220 tokenizer.nextToken();
221 while (tokenizer.ttype == StreamTokenizer.TT_WORD
222 || tokenizer.ttype == '"')
223 {
224 // note this means that if there are more values than
225 // column names, the extra values are discarded.
226 if (it.hasNext())
227 {
228 String colname = it.next().toString();
229 String colval = tokenizer.sval;
230 if (log.isDebugEnabled())
231 {
232 log.debug("DataStreamParser.nextRow(): " +
233 colname + '=' + colval);
234 }
235 lineValues.add(colname, colval);
236 }
237 tokenizer.nextToken();
238 }
239
240 return lineValues;
241 }
242
243 /**
244 * Determine whether a further row of values exists in the input.
245 *
246 * @return true if the input has more rows.
247 */
248 public boolean hasNext()
249 {
250 boolean hasNext = false;
251
252 try
253 {
254 hasNext = hasNextRow();
255 }
256 catch (IOException e)
257 {
258 log.error("IOException in CSVParser.hasNext", e);
259 }
260
261 return hasNext;
262 }
263
264 /**
265 * Returns a ValueParser object containing the next row of values.
266 *
267 * @return a ValueParser object as an Object.
268 * @exception NoSuchElementException there are no more rows in the input
269 * or an IOException occurred.
270 */
271 public ValueParser next()
272 throws NoSuchElementException
273 {
274 ValueParser nextRow = null;
275
276 try
277 {
278 nextRow = nextRow();
279 }
280 catch (IOException e)
281 {
282 log.error("IOException in CSVParser.next", e);
283 throw new NoSuchElementException();
284 }
285
286 return nextRow;
287 }
288
289 /**
290 * The optional Iterator.remove method is not supported.
291 *
292 * @exception UnsupportedOperationException the operation is not supported.
293 */
294 public void remove()
295 throws UnsupportedOperationException
296 {
297 throw new UnsupportedOperationException();
298 }
299 }