001    /*
002     * JNI_SVM-light - A Java Native Interface for SVM-light
003     * 
004     * Copyright (C) 2005 
005     * Tom Crecelius & Martin Theobald 
006     * Max-Planck Institute for Computer Science
007     * 
008     * This program is free software; you can redistribute it and/or modify it under
009     * the terms of the GNU General Public License as published by the Free Software
010     * Foundation.
011     * 
012     * This program is distributed in the hope that it will be useful, but WITHOUT
013     * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
014     * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
015     * details.
016     * 
017     * You should have received a copy of the GNU General Public License along with
018     * this program; if not, write to the Free Software Foundation, Inc., 51
019     * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
020     */
021    
022    package jnisvmlight;
023    
024    import java.io.BufferedReader;
025    import java.io.IOException;
026    import java.io.InputStreamReader;
027    import java.net.URL;
028    import java.text.ParseException;
029    import java.util.ArrayList;
030    import java.util.StringTokenizer;
031    
032    /**
033     * The main interface class that transfers the training data to the SVM-light
034     * library by a native call. Optionally takes as input an individually modified
035     * set of training parameters or an array of string paramaters that exactly
036     * simulate the command line input parameters used by the SVM-light binaries.
037     * This class can also be used for native classification calls.
038     * 
039     * @author Tom Crecelius & Martin Theobald
040     */
041    public class SVMLightInterface {
042    
043      /**
044       * Apply an in-place quicksort prior to each native training call to
045       * SVM-light. SVM-light requires each input feature vector to be sorted in
046       * ascending order of dimensions. Disable this option if you are sure to
047       * provide sorted vectors already.
048       */
049      public static boolean SORT_INPUT_VECTORS = true;
050    
051      static {
052        System.loadLibrary("svmlight");
053      }
054    
055      /**
056       * Reads a set of labeled training vectors from a URL. The format is
057       * compatible to the SVM-light training files.
058       */
059      public static LabeledFeatureVector[] getLabeledFeatureVectorsFromURL(
060          URL file, int numOfLinesToSkip) throws ParseException {
061    
062        LabeledFeatureVector[] traindata = null;
063        try {
064          ArrayList data = new ArrayList();
065          BufferedReader bi = new BufferedReader(new InputStreamReader(file
066              .openStream()));
067    
068          String line = null;
069          int cnt = 0;
070          while ((line = bi.readLine()) != null) {
071            cnt++;
072            if (cnt <= numOfLinesToSkip) {
073              continue;
074            }
075            String label = null;
076            StringTokenizer st = new StringTokenizer(line.trim());
077            if (st.countTokens() > 1) {
078              label = (String) st.nextElement();
079    
080              ArrayList dimlist = new ArrayList();
081              ArrayList vallist = new ArrayList();
082              int tokencnt = 0;
083              while (st.hasMoreElements()) {
084                String dimval = (String) st.nextElement();
085                if (dimval.trim().startsWith("#"))
086                  break;
087    
088                int idx = dimval.indexOf(':');
089                if (idx >= 0) {
090                  String dim = dimval.substring(0, idx);
091                  String val = dimval.substring(idx + 1, dimval.length());
092                  dimlist.add(dim);
093                  vallist.add(val);
094                } else {
095                  throw new ParseException("Parse error in FeatureVector of file '"
096                      + file.toString() + "' at line: " + cnt + ", token: "
097                      + tokencnt + ". Could not estimate a \"int:double\" pair ?! "
098                      + file.toString()
099                      + " contains a wrongly defined feature vector!", 0);
100                }
101                tokencnt++;
102              }
103              if (dimlist.size() > 0) {
104                double labelvalue = new Double(label).doubleValue();
105                int[] dimarray = new int[dimlist.size()];
106                double[] valarray = new double[vallist.size()];
107                for (int i = 0; i < dimlist.size(); i++) {
108                  dimarray[i] = new Integer((String) dimlist.get(i)).intValue();
109                }
110                for (int i = 0; i < vallist.size(); i++) {
111                  valarray[i] = new Double((String) vallist.get(i)).doubleValue();
112                }
113                data.add(new LabeledFeatureVector(labelvalue, dimarray, valarray));
114              }
115            } else {
116              throw new ParseException("Parse error in FeatureVector of file '"
117                  + file.toString() + "' at line: " + cnt + ". "
118                  + " Wrong format of the labeled feature vector?", 0);
119            }
120          }
121          if (data.size() > 0) {
122            traindata = new LabeledFeatureVector[data.size()];
123            for (int i = 0; i < data.size(); i++) {
124              traindata[i] = (LabeledFeatureVector) data.get(i);
125            }
126          } else {
127            throw new ParseException("No labeled features found within " + cnt
128                + "lines of file '" + file.toString() + "'.", 0);
129          }
130        } catch (IOException ioe) {
131          ioe.printStackTrace();
132        }
133        return traindata;
134      }
135    
136      protected TrainingParameters m_tp;
137    
138      /**
139       * Performs a classifcation step as a native call to SVM-light. If this method
140       * is used exlusively, no additional SVMLightModel object has to be kept in
141       * the Java runtime process.
142       */
143      public native double classifyNative(FeatureVector doc);
144    
145      public TrainingParameters getTrainingParameters() {
146        return m_tp;
147      }
148    
149      private int partition(int[] dims, double[] vals, int low, int high) {
150        Object pivot;
151        double pivotprim = 0;
152        int i = low - 1;
153        int j = high + 1;
154        pivotprim = dims[(low + high) / 2];
155        while (i < j) {
156          i++;
157          while (dims[i] < pivotprim)
158            i++;
159          j--;
160          while (dims[j] > pivotprim)
161            j--;
162          if (i < j) {
163            int tmp = dims[i];
164            dims[i] = dims[j];
165            dims[j] = tmp;
166            double tmpd = vals[i];
167            vals[i] = vals[j];
168            vals[j] = tmpd;
169          }
170        }
171        return j;
172      }
173    
174      private void quicksort(int[] dims, double[] vals, int low, int high) {
175        if (low >= high)
176          return;
177        int p = partition(dims, vals, low, high);
178        quicksort(dims, vals, low, p);
179        quicksort(dims, vals, p + 1, high);
180      }
181    
182      private void sort(FeatureVector[] trainingData) {
183        for (int i = 0; i < trainingData.length; i++) {
184          if (trainingData[i] != null)
185            quicksort(trainingData[i].m_dims, trainingData[i].m_vals, 0,
186                trainingData[i].size() - 1);
187        }
188      }
189    
190      private native SVMLightModel trainmodel(LabeledFeatureVector[] traindata,
191          TrainingParameters p);
192    
193      public SVMLightModel trainModel(LabeledFeatureVector[] trainingData) {
194        this.m_tp = new TrainingParameters();
195        if (SORT_INPUT_VECTORS) {
196          sort(trainingData);
197        }
198        return trainmodel(trainingData, m_tp);
199      }
200    
201      public SVMLightModel trainModel(LabeledFeatureVector[] trainingData,
202          String[] argv) {
203        this.m_tp = new TrainingParameters(argv);
204        if (SORT_INPUT_VECTORS) {
205          sort(trainingData);
206        }
207        return trainmodel(trainingData, m_tp);
208      }
209    
210      public SVMLightModel trainModel(LabeledFeatureVector[] trainingData,
211          TrainingParameters tp) {
212        this.m_tp = tp;
213        if (SORT_INPUT_VECTORS) {
214          sort(trainingData);
215        }
216        return trainmodel(trainingData, m_tp);
217      }
218    }