001/*
002 * Copyright 2006-2014 the original author or authors.
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 *
008 *      https://www.apache.org/licenses/LICENSE-2.0
009 *
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 */
016
017package org.springframework.batch.item.file.transform;
018
019import java.util.ArrayList;
020import java.util.Collection;
021import java.util.HashSet;
022import java.util.List;
023
024import org.springframework.beans.factory.InitializingBean;
025import org.springframework.util.Assert;
026import org.springframework.util.StringUtils;
027
028/**
029 * A {@link LineTokenizer} implementation that splits the input String on a
030 * configurable delimiter. This implementation also supports the use of an
031 * escape character to escape delimiters and line endings.
032 *
033 * @author Rob Harrop
034 * @author Dave Syer
035 * @author Michael Minella
036 */
037public class DelimitedLineTokenizer extends AbstractLineTokenizer
038        implements InitializingBean {
039        /**
040         * Convenient constant for the common case of a tab delimiter.
041         */
042        public static final String DELIMITER_TAB = "\t";
043
044        /**
045         * Convenient constant for the common case of a comma delimiter.
046         */
047        public static final String DELIMITER_COMMA = ",";
048
049        /**
050         * Convenient constant for the common case of a " character used to escape
051         * delimiters or line endings.
052         */
053        public static final char DEFAULT_QUOTE_CHARACTER = '"';
054
055        // the delimiter character used when reading input.
056        private String delimiter;
057
058        private char quoteCharacter = DEFAULT_QUOTE_CHARACTER;
059
060        private String quoteString;
061
062    private String escapedQuoteString;
063
064        private Collection<Integer> includedFields = null;
065
066        /**
067         * Create a new instance of the {@link DelimitedLineTokenizer} class for the
068         * common case where the delimiter is a {@link #DELIMITER_COMMA comma}.
069         *
070         * @see #DelimitedLineTokenizer(String)
071         * @see #DELIMITER_COMMA
072         */
073        public DelimitedLineTokenizer() {
074                this(DELIMITER_COMMA);
075        }
076
077        /**
078         * Create a new instance of the {@link DelimitedLineTokenizer} class.
079         *
080         * @param delimiter the desired delimiter.  This is required
081         */
082        public DelimitedLineTokenizer(String delimiter) {
083                Assert.notNull(delimiter, "A delimiter is required");
084                Assert.state(!delimiter.equals(String.valueOf(DEFAULT_QUOTE_CHARACTER)), "[" + DEFAULT_QUOTE_CHARACTER
085                                + "] is not allowed as delimiter for tokenizers.");
086
087                this.delimiter = delimiter;
088                setQuoteCharacter(DEFAULT_QUOTE_CHARACTER);
089        }
090
091        /**
092         * Setter for the delimiter character.
093         *
094         * @param delimiter the String used as a delimiter
095         */
096        public void setDelimiter(String delimiter) {
097                this.delimiter = delimiter;
098        }
099
100        /**
101         * The fields to include in the output by position (starting at 0). By
102         * default all fields are included, but this property can be set to pick out
103         * only a few fields from a larger set. Note that if field names are
104         * provided, their number must match the number of included fields.
105         *
106         * @param includedFields the included fields to set
107         */
108        public void setIncludedFields(int... includedFields) {
109                this.includedFields = new HashSet<>();
110                for (int i : includedFields) {
111                        this.includedFields.add(i);
112                }
113        }
114
115        /**
116         * Public setter for the quoteCharacter. The quote character can be used to
117         * extend a field across line endings or to enclose a String which contains
118         * the delimiter. Inside a quoted token the quote character can be used to
119         * escape itself, thus "a""b""c" is tokenized to a"b"c.
120         *
121         * @param quoteCharacter the quoteCharacter to set
122         *
123         * @see #DEFAULT_QUOTE_CHARACTER
124         */
125        public void setQuoteCharacter(char quoteCharacter) {
126                this.quoteCharacter = quoteCharacter;
127                this.quoteString = "" + quoteCharacter;
128        this.escapedQuoteString = "" + quoteCharacter + quoteCharacter;
129        }
130
131        /**
132         * Yields the tokens resulting from the splitting of the supplied
133         * <code>line</code>.
134         *
135         * @param line the line to be tokenized
136         *
137         * @return the resulting tokens
138         */
139        @Override
140        protected List<String> doTokenize(String line) {
141
142                List<String> tokens = new ArrayList<>();
143
144                // line is never null in current implementation
145                // line is checked in parent: AbstractLineTokenizer.tokenize()
146                char[] chars = line.toCharArray();
147                boolean inQuoted = false;
148                int lastCut = 0;
149                int length = chars.length;
150                int fieldCount = 0;
151                int endIndexLastDelimiter = -1;
152
153                for (int i = 0; i < length; i++) {
154                        char currentChar = chars[i];
155                        boolean isEnd = (i == (length - 1));
156
157            boolean isDelimiter = endsWithDelimiter(chars, i, endIndexLastDelimiter);
158
159                        if ((isDelimiter && !inQuoted) || isEnd) {
160                                endIndexLastDelimiter = i;
161                                int endPosition = (isEnd ? (length - lastCut) : (i - lastCut));
162
163                                if (isEnd && isDelimiter) {
164                                        endPosition = endPosition - delimiter.length();
165                                }
166                                else if (!isEnd){
167                                        endPosition = (endPosition - delimiter.length()) + 1;
168                                }
169
170                                if (includedFields == null || includedFields.contains(fieldCount)) {
171                    String value =
172                            substringWithTrimmedWhitespaceAndQuotesIfQuotesPresent(chars, lastCut, endPosition);
173                                        tokens.add(value);
174                                }
175
176                                fieldCount++;
177
178                                if (isEnd && (isDelimiter)) {
179                                        if (includedFields == null || includedFields.contains(fieldCount)) {
180                                                tokens.add("");
181                                        }
182                                        fieldCount++;
183                                }
184
185                                lastCut = i + 1;
186                        }
187                        else if (isQuoteCharacter(currentChar)) {
188                                inQuoted = !inQuoted;
189                        }
190
191                }
192
193                return tokens;
194        }
195
196    /**
197     * Trim any leading or trailing quotes (and any leading or trailing
198     * whitespace before or after the quotes) from within the specified character
199     * array beginning at the specified offset index for the specified count.
200     * <p/>
201     * Quotes are escaped with double instances of the quote character.
202     *
203     * @param chars  the character array
204     * @param offset index from which to begin extracting substring
205     * @param count  length of substring
206     * @return a substring from the specified offset within the character array
207     * with any leading or trailing whitespace trimmed.
208     * @see String#trim()
209     */
210    private String substringWithTrimmedWhitespaceAndQuotesIfQuotesPresent(char chars[], int offset, int count) {
211        int start = offset;
212        int len = count;
213
214        while ((start < (start + len - 1)) && (chars[start] <= ' ')) {
215            start++;
216            len--;
217        }
218
219        while ((start < (start + len)) && ((start + len - 1 < chars.length) && (chars[start + len - 1] <= ' '))) {
220            len--;
221        }
222
223        String value;
224
225        if ((chars.length >= 2) && (chars[start] == quoteCharacter) && (chars[start + len - 1] == quoteCharacter)) {
226            value = new String(chars, start + 1, len - 2);
227            if (value.contains(escapedQuoteString)) {
228                value = StringUtils.replace(value, escapedQuoteString, quoteString);
229            }
230        }
231        else {
232            value = new String(chars, offset, count);
233        }
234
235        return value;
236    }
237
238    /**
239     * Do the character(s) in the specified array end, at the specified end
240     * index, with the delimiter character(s)?
241     * <p/>
242     * Checks that the specified end index is sufficiently greater than the
243     * specified previous delimiter end index to warrant trying to match
244     * another delimiter.  Also checks that the specified end index is
245     * sufficiently large to be able to match the length of a delimiter.
246     *
247     * @param chars    the character array
248     * @param end      the index in up to which the delimiter should be matched
249     * @param previous the index of the end of the last delimiter
250     * @return <code>true</code> if the character(s) from the specified end
251     * match the delimiter character(s), otherwise false
252     * @see DelimitedLineTokenizer#DelimitedLineTokenizer(String)
253     */
254    private boolean endsWithDelimiter(char[] chars, int end, int previous) {
255        boolean result = false;
256
257        if (end - previous >= delimiter.length()) {
258            if (end >= delimiter.length() - 1) {
259                result = true;
260                for (int j = 0; j < delimiter.length() && (((end - delimiter.length() + 1) + j) < chars.length); j++) {
261                    if (delimiter.charAt(j) != chars[(end - delimiter.length() + 1) + j]) {
262                        result = false;
263                    }
264                }
265            }
266        }
267
268        return result;
269    }
270
271        /**
272         * Is the supplied character a quote character?
273         *
274         * @param c the character to be checked
275         * @return <code>true</code> if the supplied character is an quote character
276         * @see #setQuoteCharacter(char)
277         */
278        protected boolean isQuoteCharacter(char c) {
279                return c == quoteCharacter;
280        }
281
282        @Override
283        public void afterPropertiesSet() throws Exception {
284                Assert.hasLength(this.delimiter, "A delimiter is required");
285        }
286}