001 /*
002 * JNI_SVM-light - A Java Native Interface for SVM-light
003 *
004 * Copyright (C) 2005
005 * Tom Crecelius & Martin Theobald
006 * Max-Planck Institute for Computer Science
007 *
008 * This program is free software; you can redistribute it and/or modify it under
009 * the terms of the GNU General Public License as published by the Free Software
010 * Foundation.
011 *
012 * This program is distributed in the hope that it will be useful, but WITHOUT
013 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
014 * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
015 * details.
016 *
017 * You should have received a copy of the GNU General Public License along with
018 * this program; if not, write to the Free Software Foundation, Inc., 51
019 * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
020 */
021
022 package jnisvmlight;
023
024 import java.io.BufferedReader;
025 import java.io.IOException;
026 import java.io.InputStreamReader;
027 import java.net.URL;
028 import java.text.ParseException;
029 import java.util.ArrayList;
030 import java.util.StringTokenizer;
031
032 /**
033 * The main interface class that transfers the training data to the SVM-light
034 * library by a native call. Optionally takes as input an individually modified
035 * set of training parameters or an array of string parameters that exactly
036 * simulate the command line input parameters used by the SVM-light binaries.
037 * This class can also be used for native classification calls.
038 *
039 * @author Tom Crecelius & Martin Theobald
040 */
041 public class SVMLightInterface {
042
043 /**
044 * Apply an in-place quicksort prior to each native training call to
045 * SVM-light. SVM-light requires each input feature vector to be sorted in
046 * ascending order of dimensions. Disable this option if you are sure to
047 * provide sorted vectors already.
048 */
049 public static boolean SORT_INPUT_VECTORS = true;
050
051 static {
052 System.loadLibrary("svmlight");
053 }
054
055 /**
056 * Reads a set of labeled training vectors from a URL. The format is
057 * compatible to the SVM-light training files.
058 */
059 public static LabeledFeatureVector[] getLabeledFeatureVectorsFromURL(
060 URL file, int numOfLinesToSkip) throws ParseException {
061
062 LabeledFeatureVector[] traindata = null;
063 try {
064 ArrayList data = new ArrayList();
065 BufferedReader bi = new BufferedReader(new InputStreamReader(file
066 .openStream()));
067
068 String line = null;
069 int cnt = 0;
070 while ((line = bi.readLine()) != null) {
071 cnt++;
072 if (cnt <= numOfLinesToSkip) {
073 continue;
074 }
075 String label = null;
076 StringTokenizer st = new StringTokenizer(line.trim());
077 if (st.countTokens() > 1) {
078 label = (String) st.nextElement();
079
080 ArrayList dimlist = new ArrayList();
081 ArrayList vallist = new ArrayList();
082 int tokencnt = 0;
083 while (st.hasMoreElements()) {
084 String dimval = (String) st.nextElement();
085 if (dimval.trim().startsWith("#"))
086 break;
087
088 int idx = dimval.indexOf(':');
089 if (idx >= 0) {
090 String dim = dimval.substring(0, idx);
091 String val = dimval.substring(idx + 1, dimval.length());
092 dimlist.add(dim);
093 vallist.add(val);
094 } else {
095 throw new ParseException("Parse error in FeatureVector of file '"
096 + file.toString() + "' at line: " + cnt + ", token: "
097 + tokencnt + ". Could not estimate a \"int:double\" pair ?! "
098 + file.toString()
099 + " contains a wrongly defined feature vector!", 0);
100 }
101 tokencnt++;
102 }
103 if (dimlist.size() > 0) {
104 double labelvalue = new Double(label).doubleValue();
105 int[] dimarray = new int[dimlist.size()];
106 double[] valarray = new double[vallist.size()];
107 for (int i = 0; i < dimlist.size(); i++) {
108 dimarray[i] = new Integer((String) dimlist.get(i)).intValue();
109 }
110 for (int i = 0; i < vallist.size(); i++) {
111 valarray[i] = new Double((String) vallist.get(i)).doubleValue();
112 }
113 data.add(new LabeledFeatureVector(labelvalue, dimarray, valarray));
114 }
115 } else {
116 throw new ParseException("Parse error in FeatureVector of file '"
117 + file.toString() + "' at line: " + cnt + ". "
118 + " Wrong format of the labeled feature vector?", 0);
119 }
120 }
121 if (data.size() > 0) {
122 traindata = new LabeledFeatureVector[data.size()];
123 for (int i = 0; i < data.size(); i++) {
124 traindata[i] = (LabeledFeatureVector) data.get(i);
125 }
126 } else {
127 throw new ParseException("No labeled features found within " + cnt
128 + "lines of file '" + file.toString() + "'.", 0);
129 }
130 } catch (IOException ioe) {
131 ioe.printStackTrace();
132 }
133 return traindata;
134 }
135
136 protected TrainingParameters m_tp;
137
138 /**
139 * Performs a classifcation step as a native call to SVM-light. If this method
140 * is used exlusively, no additional SVMLightModel object has to be kept in
141 * the Java runtime process.
142 */
143 public native double classifyNative(FeatureVector doc);
144
145 public TrainingParameters getTrainingParameters() {
146 return m_tp;
147 }
148
149 private int partition(int[] dims, double[] vals, int low, int high) {
150 Object pivot;
151 double pivotprim = 0;
152 int i = low - 1;
153 int j = high + 1;
154 pivotprim = dims[(low + high) / 2];
155 while (i < j) {
156 i++;
157 while (dims[i] < pivotprim)
158 i++;
159 j--;
160 while (dims[j] > pivotprim)
161 j--;
162 if (i < j) {
163 int tmp = dims[i];
164 dims[i] = dims[j];
165 dims[j] = tmp;
166 double tmpd = vals[i];
167 vals[i] = vals[j];
168 vals[j] = tmpd;
169 }
170 }
171 return j;
172 }
173
174 private void quicksort(int[] dims, double[] vals, int low, int high) {
175 if (low >= high)
176 return;
177 int p = partition(dims, vals, low, high);
178 quicksort(dims, vals, low, p);
179 quicksort(dims, vals, p + 1, high);
180 }
181
182 private void sort(FeatureVector[] trainingData) {
183 for (int i = 0; i < trainingData.length; i++) {
184 if (trainingData[i] != null)
185 quicksort(trainingData[i].m_dims, trainingData[i].m_vals, 0,
186 trainingData[i].size() - 1);
187 }
188 }
189
190 private native SVMLightModel trainmodel(LabeledFeatureVector[] traindata,
191 TrainingParameters p);
192
193 public SVMLightModel trainModel(LabeledFeatureVector[] trainingData) {
194 this.m_tp = new TrainingParameters();
195 if (SORT_INPUT_VECTORS) {
196 sort(trainingData);
197 }
198 return trainmodel(trainingData, m_tp);
199 }
200
201 public SVMLightModel trainModel(LabeledFeatureVector[] trainingData,
202 String[] argv) {
203 this.m_tp = new TrainingParameters(argv);
204 if (SORT_INPUT_VECTORS) {
205 sort(trainingData);
206 }
207 return trainmodel(trainingData, m_tp);
208 }
209
210 public SVMLightModel trainModel(LabeledFeatureVector[] trainingData,
211 TrainingParameters tp) {
212 this.m_tp = tp;
213 if (SORT_INPUT_VECTORS) {
214 sort(trainingData);
215 }
216 return trainmodel(trainingData, m_tp);
217 }
218 }