001    /*

002     * JNI_SVM-light - A Java Native Interface for SVM-light

003     * 

004     * Copyright (C) 2005 

005     * Tom Crecelius & Martin Theobald 

006     * Max-Planck Institute for Computer Science

007     * 

008     * This program is free software; you can redistribute it and/or modify it under

009     * the terms of the GNU General Public License as published by the Free Software

010     * Foundation.

011     * 

012     * This program is distributed in the hope that it will be useful, but WITHOUT

013     * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS

014     * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more

015     * details.

016     * 

017     * You should have received a copy of the GNU General Public License along with

018     * this program; if not, write to the Free Software Foundation, Inc., 51

019     * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA

020     */

021    

022    package jnisvmlight;

023    

024    import java.io.BufferedReader;

025    import java.io.IOException;

026    import java.io.InputStreamReader;

027    import java.net.URL;

028    import java.text.ParseException;

029    import java.util.ArrayList;

030    import java.util.StringTokenizer;

031    

032    /**

033     * The main interface class that transfers the training data to the SVM-light

034     * library by a native call. Optionally takes as input an individually modified

035     * set of training parameters or an array of string paramaters that exactly

036     * simulate the command line input parameters used by the SVM-light binaries.

037     * This class can also be used for native classification calls.

038     * 

039     * @author Tom Crecelius & Martin Theobald

040     */

041    public class SVMLightInterface {

042    

043      /**

044       * Apply an in-place quicksort prior to each native training call to

045       * SVM-light. SVM-light requires each input feature vector to be sorted in

046       * ascending order of dimensions. Disable this option if you are sure to

047       * provide sorted vectors already.

048       */

049      public static boolean SORT_INPUT_VECTORS = true;

050    

051      static {

052        System.loadLibrary("svmlight");

053      }

054    

055      /**

056       * Reads a set of labeled training vectors from a URL. The format is

057       * compatible to the SVM-light training files.

058       */

059      public static LabeledFeatureVector[] getLabeledFeatureVectorsFromURL(

060          URL file, int numOfLinesToSkip) throws ParseException {

061    

062        LabeledFeatureVector[] traindata = null;

063        try {

064          ArrayList data = new ArrayList();

065          BufferedReader bi = new BufferedReader(new InputStreamReader(file

066              .openStream()));

067    

068          String line = null;

069          int cnt = 0;

070          while ((line = bi.readLine()) != null) {

071            cnt++;

072            if (cnt <= numOfLinesToSkip) {

073              continue;

074            }

075            String label = null;

076            StringTokenizer st = new StringTokenizer(line.trim());

077            if (st.countTokens() > 1) {

078              label = (String) st.nextElement();

079    

080              ArrayList dimlist = new ArrayList();

081              ArrayList vallist = new ArrayList();

082              int tokencnt = 0;

083              while (st.hasMoreElements()) {

084                String dimval = (String) st.nextElement();

085                if (dimval.trim().startsWith("#"))

086                  break;

087    

088                int idx = dimval.indexOf(':');

089                if (idx >= 0) {

090                  String dim = dimval.substring(0, idx);

091                  String val = dimval.substring(idx + 1, dimval.length());

092                  dimlist.add(dim);

093                  vallist.add(val);

094                } else {

095                  throw new ParseException("Parse error in FeatureVector of file '"

096                      + file.toString() + "' at line: " + cnt + ", token: "

097                      + tokencnt + ". Could not estimate a \"int:double\" pair ?! "

098                      + file.toString()

099                      + " contains a wrongly defined feature vector!", 0);

100                }

101                tokencnt++;

102              }

103              if (dimlist.size() > 0) {

104                double labelvalue = new Double(label).doubleValue();

105                int[] dimarray = new int[dimlist.size()];

106                double[] valarray = new double[vallist.size()];

107                for (int i = 0; i < dimlist.size(); i++) {

108                  dimarray[i] = new Integer((String) dimlist.get(i)).intValue();

109                }

110                for (int i = 0; i < vallist.size(); i++) {

111                  valarray[i] = new Double((String) vallist.get(i)).doubleValue();

112                }

113                data.add(new LabeledFeatureVector(labelvalue, dimarray, valarray));

114              }

115            } else {

116              throw new ParseException("Parse error in FeatureVector of file '"

117                  + file.toString() + "' at line: " + cnt + ". "

118                  + " Wrong format of the labeled feature vector?", 0);

119            }

120          }

121          if (data.size() > 0) {

122            traindata = new LabeledFeatureVector[data.size()];

123            for (int i = 0; i < data.size(); i++) {

124              traindata[i] = (LabeledFeatureVector) data.get(i);

125            }

126          } else {

127            throw new ParseException("No labeled features found within " + cnt

128                + "lines of file '" + file.toString() + "'.", 0);

129          }

130        } catch (IOException ioe) {

131          ioe.printStackTrace();

132        }

133        return traindata;

134      }

135    

136      protected TrainingParameters m_tp;

137    

138      /**

139       * Performs a classifcation step as a native call to SVM-light. If this method

140       * is used exlusively, no additional SVMLightModel object has to be kept in

141       * the Java runtime process.

142       */

143      public native double classifyNative(FeatureVector doc);

144    

145      public TrainingParameters getTrainingParameters() {

146        return m_tp;

147      }

148    

149      private int partition(int[] dims, double[] vals, int low, int high) {

150        Object pivot;

151        double pivotprim = 0;

152        int i = low - 1;

153        int j = high + 1;

154        pivotprim = dims[(low + high) / 2];

155        while (i < j) {

156          i++;

157          while (dims[i] < pivotprim)

158            i++;

159          j--;

160          while (dims[j] > pivotprim)

161            j--;

162          if (i < j) {

163            int tmp = dims[i];

164            dims[i] = dims[j];

165            dims[j] = tmp;

166            double tmpd = vals[i];

167            vals[i] = vals[j];

168            vals[j] = tmpd;

169          }

170        }

171        return j;

172      }

173    

174      private void quicksort(int[] dims, double[] vals, int low, int high) {

175        if (low >= high)

176          return;

177        int p = partition(dims, vals, low, high);

178        quicksort(dims, vals, low, p);

179        quicksort(dims, vals, p + 1, high);

180      }

181    

182      private void sort(FeatureVector[] trainingData) {

183        for (int i = 0; i < trainingData.length; i++) {

184          if (trainingData[i] != null)

185            quicksort(trainingData[i].m_dims, trainingData[i].m_vals, 0,

186                trainingData[i].size() - 1);

187        }

188      }

189    

190      private native SVMLightModel trainmodel(LabeledFeatureVector[] traindata,

191          TrainingParameters p);

192    

193      public SVMLightModel trainModel(LabeledFeatureVector[] trainingData) {

194        this.m_tp = new TrainingParameters();

195        if (SORT_INPUT_VECTORS) {

196          sort(trainingData);

197        }

198        return trainmodel(trainingData, m_tp);

199      }

200    

201      public SVMLightModel trainModel(LabeledFeatureVector[] trainingData,

202          String[] argv) {

203        this.m_tp = new TrainingParameters(argv);

204        if (SORT_INPUT_VECTORS) {

205          sort(trainingData);

206        }

207        return trainmodel(trainingData, m_tp);

208      }

209    

210      public SVMLightModel trainModel(LabeledFeatureVector[] trainingData,

211          TrainingParameters tp) {

212        this.m_tp = tp;

213        if (SORT_INPUT_VECTORS) {

214          sort(trainingData);

215        }

216        return trainmodel(trainingData, m_tp);

217      }

218    }