org.apache.hadoop.mapred.FileOutputFormat.setOutputPath() - java examples

Here are examples of the Java API org.apache.hadoop.mapred.FileOutputFormat.setOutputPath(), taken from open source projects. By voting up, you can indicate which examples are most useful and appropriate.
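Before the examples, here is a minimal, self-contained sketch of how FileOutputFormat.setOutputPath() is typically used with the old org.apache.hadoop.mapred (JobConf-based) API. The paths, job name, and the identity mapper/reducer below are illustrative assumptions, not taken from the projects listed on this page.

import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.mapred.lib.IdentityMapper;
import org.apache.hadoop.mapred.lib.IdentityReducer;

public class SetOutputPathExample {

    public static void main(String[] args) throws IOException {
        // Hypothetical input and output locations; substitute your own.
        Path input = new Path("/tmp/setoutputpath-example/input");
        Path output = new Path("/tmp/setoutputpath-example/output");

        JobConf conf = new JobConf(SetOutputPathExample.class);
        conf.setJobName("FileOutputFormat.setOutputPath example");

        // Plain text in, plain text out; the identity map/reduce just copies records.
        conf.setInputFormat(TextInputFormat.class);
        conf.setOutputFormat(TextOutputFormat.class);
        conf.setOutputKeyClass(LongWritable.class);
        conf.setOutputValueClass(Text.class);
        conf.setMapperClass(IdentityMapper.class);
        conf.setReducerClass(IdentityReducer.class);

        FileInputFormat.setInputPaths(conf, input);
        // The call this page documents: declare the directory where the job
        // writes its part files. The directory must not exist yet, or job
        // submission fails because the output directory already exists.
        FileOutputFormat.setOutputPath(conf, output);

        JobClient.runJob(conf);
    }
}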

155 Examples

17 View Complete Implementation : TFIDFConverter.java
Copyright Apache License 2.0
Author : sisirkoppaka
/**
 * Count the document frequencies of features in parallel using Map/Reduce. The input documents have to be
 * in {@link SequenceFile} format
 */
private static void startDFCounting(Path input, Path output) throws IOException {
    Configurable client = new JobClient();
    JobConf conf = new JobConf(TFIDFConverter.class);
    conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization," + "org.apache.hadoop.io.serializer.WritableSerialization");
    // this conf parameter needs to be set to enable serialisation of conf values
    conf.setJobName("VectorTfIdf Document Frequency Count running over input: " + input.toString());
    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(LongWritable.class);
    FileInputFormat.setInputPaths(conf, input);
    FileOutputFormat.setOutputPath(conf, output);
    conf.setMapperClass(TermDocumentCountMapper.class);
    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setCombinerClass(TermDocumentCountReducer.class);
    conf.setReducerClass(TermDocumentCountReducer.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    HadoopUtil.overwriteOutput(output);
    client.setConf(conf);
    JobClient.runJob(conf);
}

17 View Complete Implementation : TFIDFConverter.java
Copyright Apache License 2.0
Author : ogrisel
/**
 * Count the document frequencies of features in parallel using Map/Reduce. The input documents have to be
 * in {@link SequenceFile} format
 */
private static void startDFCounting(Path input, Path output) throws IOException {
    Configurable client = new JobClient();
    JobConf conf = new JobConf(TFIDFConverter.class);
    conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization," + "org.apache.hadoop.io.serializer.WritableSerialization");
    // this conf parameter needs to be set to enable serialisation of conf values
    conf.setJobName("VectorTfIdf Document Frequency Count running over input: " + input.toString());
    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(LongWritable.class);
    FileInputFormat.setInputPaths(conf, input);
    FileOutputFormat.setOutputPath(conf, output);
    conf.setMapperClass(TermDocumentCountMapper.class);
    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setCombinerClass(TermDocumentCountReducer.class);
    conf.setReducerClass(TermDocumentCountReducer.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    HadoopUtil.overwriteOutput(output);
    client.setConf(conf);
    JobClient.runJob(conf);
}

16 View Complete Implementation : InMemBuilder.java
Copyright Apache License 2.0
Author : ogrisel
@Override
protected void configureJob(JobConf conf, int nbTrees, boolean oobEstimate) throws IOException {
    FileOutputFormat.setOutputPath(conf, getOutputPath(conf));
    // put the data in the DistributedCache
    DistributedCache.addCacheFile(getDataPath().toUri(), conf);
    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(MapredOutput.class);
    conf.setMapperClass(InMemMapper.class);
    // no reducers
    conf.setNumReduceTasks(0);
    conf.setInputFormat(InMemInputFormat.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
}

16 View Complete Implementation : InMemBuilder.java
Copyright Apache License 2.0
Author : sisirkoppaka
@Override
protected void configureJob(JobConf conf, int nbTrees, boolean oobEstimate) throws IOException {
    FileOutputFormat.setOutputPath(conf, getOutputPath(conf));
    // put the data in the DistributedCache
    DistributedCache.addCacheFile(getDataPath().toUri(), conf);
    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(MapredOutput.class);
    conf.setMapperClass(InMemMapper.class);
    // no reducers
    conf.setNumReduceTasks(0);
    conf.setInputFormat(InMemInputFormat.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
}

16 View Complete Implementation : TFIDFConverter.java
Copyright Apache License 2.0
Author : ogrisel
/**
 * Create a partial tfidf vector using a chunk of features from the input vectors. The input vectors have to
 * be in the {@link SequenceFile} format
 *
 * @param input
 *          input directory of the vectors in {@link SequenceFile} format
 * @param featureCount
 *          Number of unique features in the dataset
 * @param vectorCount
 *          Number of vectors in the dataset
 * @param minDf
 *          The minimum document frequency. Default 1
 * @param maxDFPercent
 *          The max percentage of vectors for the DF. Can be used to remove really high frequency features.
 *          Expressed as an integer between 0 and 100. Default 99
 * @param dictionaryFilePath
 *          location of the chunk of features and the id's
 * @param output
 *          output directory where the partial vectors have to be created
 * @throws IOException
 */
private static void makePartialVectors(Path input, Long featureCount, Long vectorCount, int minDf, int maxDFPercent, Path dictionaryFilePath, Path output, boolean sequentialAccess) throws IOException {
    Configurable client = new JobClient();
    JobConf conf = new JobConf(TFIDFConverter.class);
    conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization," + "org.apache.hadoop.io.serializer.WritableSerialization");
    // this conf parameter needs to be set to enable serialisation of conf values
    conf.setJobName(": MakePartialVectors: input-folder: " + input + ", dictionary-file: " + dictionaryFilePath.toString());
    conf.setLong(FEATURE_COUNT, featureCount);
    conf.setLong(VECTOR_COUNT, vectorCount);
    conf.setInt(MIN_DF, minDf);
    conf.setInt(MAX_DF_PERCENTAGE, maxDFPercent);
    conf.setBoolean(PartialVectorMerger.SEQUENTIAL_ACCESS, sequentialAccess);
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(VectorWritable.class);
    DistributedCache.setCacheFiles(new URI[] { dictionaryFilePath.toUri() }, conf);
    FileInputFormat.setInputPaths(conf, input);
    FileOutputFormat.setOutputPath(conf, output);
    conf.setMapperClass(IdentityMapper.class);
    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setReducerClass(TFIDFPartialVectorReducer.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    HadoopUtil.overwriteOutput(output);
    client.setConf(conf);
    JobClient.runJob(conf);
}

16 View Complete Implementation : InputDriver.java
Copyright Apache License 2.0
Author : ogrisel
public static void runJob(Path input, Path output) throws IOException {
    JobClient client = new JobClient();
    JobConf conf = new JobConf(org.apache.mahout.clustering.syntheticcontrol.meanshift.InputDriver.class);
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(MeanShiftCanopy.class);
    FileInputFormat.setInputPaths(conf, input);
    FileOutputFormat.setOutputPath(conf, output);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    conf.setMapperClass(org.apache.mahout.clustering.syntheticcontrol.meanshift.InputMapper.class);
    conf.setReducerClass(Reducer.class);
    conf.setNumReduceTasks(0);
    client.setConf(conf);
    JobClient.runJob(conf);
}

16 View Complete Implementation : CDbwDriver.java
Copyright Apache License 2.0
Author : ogrisel
/**
 * Run the job using supplied arguments
 *
 * @param input
 *          the directory pathname for input points
 * @param stateIn
 *          the directory pathname for input state
 * @param stateOut
 *          the directory pathname for output state
 * @param distanceMeasureClass
 *          the class name of the DistanceMeasure class
 * @param numReducers
 *          the number of Reducers desired
 */
public static void runIteration(Path input, Path stateIn, Path stateOut, String distanceMeasureClass, int numReducers) {
    Configurable client = new JobClient();
    JobConf conf = new JobConf(CDbwDriver.class);
    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(VectorWritable.class);
    conf.setMapOutputKeyClass(IntWritable.class);
    conf.setMapOutputValueClass(WeightedVectorWritable.class);
    FileInputFormat.setInputPaths(conf, input);
    FileOutputFormat.setOutputPath(conf, stateOut);
    conf.setMapperClass(CDbwMapper.class);
    conf.setReducerClass(CDbwReducer.class);
    conf.setNumReduceTasks(numReducers);
    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    conf.set(STATE_IN_KEY, stateIn.toString());
    conf.set(DISTANCE_MEASURE_KEY, distanceMeasureClass);
    client.setConf(conf);
    try {
        JobClient.runJob(conf);
    } catch (IOException e) {
        log.warn(e.toString(), e);
    }
}

16 View Complete Implementation : MeanShiftCanopyDriver.java
Copyright Apache License 2.0
Author : ogrisel
/**
 * Run an iteration
 *
 * @param input
 *          the input pathname String
 * @param output
 *          the output pathname String
 * @param control
 *          the control path
 * @param measureClassName
 *          the DistanceMeasure class name
 * @param t1
 *          the T1 distance threshold
 * @param t2
 *          the T2 distance threshold
 * @param convergenceDelta
 *          the double convergence criteria
 */
static void runIteration(Path input, Path output, Path control, String measureClassName, double t1, double t2, double convergenceDelta) {
    Configurable client = new JobClient();
    JobConf conf = new JobConf(MeanShiftCanopyDriver.class);
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(MeanShiftCanopy.class);
    FileInputFormat.setInputPaths(conf, input);
    FileOutputFormat.setOutputPath(conf, output);
    conf.setMapperClass(MeanShiftCanopyMapper.class);
    conf.setReducerClass(MeanShiftCanopyReducer.class);
    conf.setNumReduceTasks(1);
    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    conf.set(MeanShiftCanopyConfigKeys.DISTANCE_MEASURE_KEY, measureClassName);
    conf.set(MeanShiftCanopyConfigKeys.CLUSTER_CONVERGENCE_KEY, String.valueOf(convergenceDelta));
    conf.set(MeanShiftCanopyConfigKeys.T1_KEY, String.valueOf(t1));
    conf.set(MeanShiftCanopyConfigKeys.T2_KEY, String.valueOf(t2));
    conf.set(MeanShiftCanopyConfigKeys.CONTROL_PATH_KEY, control.toString());
    client.setConf(conf);
    try {
        JobClient.runJob(conf);
    } catch (IOException e) {
        log.warn(e.toString(), e);
    }
}

16 View Complete Implementation : MeanShiftCanopyDriver.java
Copyright Apache License 2.0
Author : ogrisel
/**
 * Run the job using supplied arguments
 *
 * @param input
 *          the directory pathname for input points
 * @param clustersIn
 *          the directory pathname for input clusters
 * @param output
 *          the directory pathname for output clustered points
 */
static void runClustering(Path input, Path clustersIn, Path output) {
    JobConf conf = new JobConf(FuzzyKMeansDriver.class);
    conf.setJobName("Mean Shift Clustering");
    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(WeightedVectorWritable.class);
    FileInputFormat.setInputPaths(conf, input);
    FileOutputFormat.setOutputPath(conf, output);
    conf.setMapperClass(MeanShiftCanopyClusterMapper.class);
    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    // uncomment it to run locally
    // conf.set("mapred.job.tracker", "local");
    conf.setNumReduceTasks(0);
    conf.set(STATE_IN_KEY, clustersIn.toString());
    try {
        JobClient.runJob(conf);
    } catch (IOException e) {
        log.warn(e.toString(), e);
    }
}

16 View Complete Implementation : TestAppendStress.java
Copyright Apache License 2.0
Author : facebookarchive
private void startAppendJob(Configuration conf) throws IOException {
    JobConf job = new JobConf(conf, TestAppendStress.class);
    job.set(JOB_START_TIME_LABEL, new Date().toString());
    FileInputFormat.setInputPaths(job, CONTROL_DIR);
    FileOutputFormat.setOutputPath(job, APPEND_DIR);
    job.setInputFormat(SequenceFileInputFormat.class);
    job.setMapperClass(AppendMapper.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    job.setNumReduceTasks(0);
    JobClient.runJob(job);
}

16 View Complete Implementation : CDbwDriver.java
Copyright Apache License 2.0
Author : sisirkoppaka
/**
 * Run the job using supplied arguments
 *
 * @param input
 *          the directory pathname for input points
 * @param stateIn
 *          the directory pathname for input state
 * @param stateOut
 *          the directory pathname for output state
 * @param distanceMeasureClass
 *          the class name of the DistanceMeasure class
 * @param numReducers
 *          the number of Reducers desired
 */
public static void runIteration(Path input, Path stateIn, Path stateOut, String distanceMeasureClass, int numReducers) {
    Configurable client = new JobClient();
    JobConf conf = new JobConf(CDbwDriver.class);
    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(VectorWritable.class);
    conf.setMapOutputKeyClass(IntWritable.class);
    conf.setMapOutputValueClass(WeightedVectorWritable.class);
    FileInputFormat.setInputPaths(conf, input);
    FileOutputFormat.setOutputPath(conf, stateOut);
    conf.setMapperClass(CDbwMapper.class);
    conf.setReducerClass(CDbwReducer.class);
    conf.setNumReduceTasks(numReducers);
    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    conf.set(STATE_IN_KEY, stateIn.toString());
    conf.set(DISTANCE_MEASURE_KEY, distanceMeasureClass);
    client.setConf(conf);
    try {
        JobClient.runJob(conf);
    } catch (IOException e) {
        log.warn(e.toString(), e);
    }
}

16 View Complete Implementation : DictionaryVectorizer.java
Copyright Apache License 2.0
Author : ogrisel
/**
 * Count the frequencies of words in parallel using Map/Reduce. The input documents have to be in
 * {@link SequenceFile} format
 */
private static void startWordCounting(Path input, Path output, int minSupport) throws IOException {
    Configurable client = new JobClient();
    JobConf conf = new JobConf(DictionaryVectorizer.class);
    conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization," + "org.apache.hadoop.io.serializer.WritableSerialization");
    // this conf parameter needs to be set to enable serialisation of conf values
    conf.setJobName("DictionaryVectorizer::WordCount: input-folder: " + input.toString());
    conf.setInt(MIN_SUPPORT, minSupport);
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(LongWritable.class);
    FileInputFormat.setInputPaths(conf, input);
    FileOutputFormat.setOutputPath(conf, output);
    conf.setMapperClass(TermCountMapper.class);
    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setCombinerClass(TermCountReducer.class);
    conf.setReducerClass(TermCountReducer.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    HadoopUtil.overwriteOutput(output);
    client.setConf(conf);
    JobClient.runJob(conf);
}

16 View Complete Implementation : InputDriver.java
Copyright Apache License 2.0
Author : ogrisel
public static void runJob(Path input, Path output, String vectorClassName) throws IOException {
    JobClient client = new JobClient();
    JobConf conf = new JobConf(InputDriver.class);
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(VectorWritable.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    conf.set("vector.implementation.class.name", vectorClassName);
    FileInputFormat.setInputPaths(conf, input);
    FileOutputFormat.setOutputPath(conf, output);
    conf.setMapperClass(InputMapper.class);
    conf.setReducerClass(Reducer.class);
    conf.setNumReduceTasks(0);
    client.setConf(conf);
    JobClient.runJob(conf);
}

16 View Complete Implementation : InputDriver.java
Copyright Apache License 2.0
Author : sisirkoppaka
public static void runJob(Path input, Path output) throws IOException {
    JobClient client = new JobClient();
    JobConf conf = new JobConf(org.apache.mahout.clustering.syntheticcontrol.meanshift.InputDriver.class);
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(MeanShiftCanopy.class);
    FileInputFormat.setInputPaths(conf, input);
    FileOutputFormat.setOutputPath(conf, output);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    conf.setMapperClass(org.apache.mahout.clustering.syntheticcontrol.meanshift.InputMapper.class);
    conf.setReducerClass(Reducer.class);
    conf.setNumReduceTasks(0);
    client.setConf(conf);
    JobClient.runJob(conf);
}

16 View Complete Implementation : CollocDriver.java
Copyright Apache License 2.0
Author : sisirkoppaka
/**
 * pass2: perform the LLR calculation
 */
public static void computeNGramsPruneByLLR(long nGramTotal, Path output, boolean emitUnigrams, float minLLRValue, int reduceTasks) throws IOException {
    JobConf conf = new JobConf(CollocDriver.class);
    conf.setJobName(CollocDriver.class.getSimpleName() + ".computeNGrams: " + output);
    conf.setLong(LLRReducer.NGRAM_TOTAL, nGramTotal);
    conf.setBoolean(EMIT_UNIGRAMS, emitUnigrams);
    conf.setMapOutputKeyClass(Gram.class);
    conf.setMapOutputValueClass(Gram.class);
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(DoubleWritable.class);
    FileInputFormat.setInputPaths(conf, new Path(output, SUBGRAM_OUTPUT_DIRECTORY));
    Path outPath = new Path(output, NGRAM_OUTPUT_DIRECTORY);
    FileOutputFormat.setOutputPath(conf, outPath);
    conf.setMapperClass(IdentityMapper.class);
    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    conf.setReducerClass(LLRReducer.class);
    conf.setNumReduceTasks(reduceTasks);
    conf.setFloat(LLRReducer.MIN_LLR, minLLRValue);
    JobClient.runJob(conf);
}

16 View Complete Implementation : DictionaryVectorizer.java
Copyright Apache License 2.0
Author : sisirkoppaka
/**
 * Count the frequencies of words in parallel using Map/Reduce. The input documents have to be in
 * {@link SequenceFile} format
 */
private static void startWordCounting(Path input, Path output, int minSupport) throws IOException {
    Configurable client = new JobClient();
    JobConf conf = new JobConf(DictionaryVectorizer.class);
    conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization," + "org.apache.hadoop.io.serializer.WritableSerialization");
    // this conf parameter needs to be set to enable serialisation of conf values
    conf.setJobName("DictionaryVectorizer::WordCount: input-folder: " + input.toString());
    conf.setInt(MIN_SUPPORT, minSupport);
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(LongWritable.class);
    FileInputFormat.setInputPaths(conf, input);
    FileOutputFormat.setOutputPath(conf, output);
    conf.setMapperClass(TermCountMapper.class);
    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setCombinerClass(TermCountReducer.class);
    conf.setReducerClass(TermCountReducer.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    HadoopUtil.overwriteOutput(output);
    client.setConf(conf);
    JobClient.runJob(conf);
}

16 View Complete Implementation : MeanShiftCanopyDriver.java
Copyright Apache License 2.0
Author : sisirkoppaka
/**
 * Run an iteration
 *
 * @param input
 *          the input pathname String
 * @param output
 *          the output pathname String
 * @param control
 *          the control path
 * @param measureClassName
 *          the DistanceMeasure class name
 * @param t1
 *          the T1 distance threshold
 * @param t2
 *          the T2 distance threshold
 * @param convergenceDelta
 *          the double convergence criteria
 */
static void runIteration(Path input, Path output, Path control, String measureClassName, double t1, double t2, double convergenceDelta) {
    Configurable client = new JobClient();
    JobConf conf = new JobConf(MeanShiftCanopyDriver.class);
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(MeanShiftCanopy.class);
    FileInputFormat.setInputPaths(conf, input);
    FileOutputFormat.setOutputPath(conf, output);
    conf.setMapperClass(MeanShiftCanopyMapper.class);
    conf.setReducerClass(MeanShiftCanopyReducer.class);
    conf.setNumReduceTasks(1);
    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    conf.set(MeanShiftCanopyConfigKeys.DISTANCE_MEASURE_KEY, measureClassName);
    conf.set(MeanShiftCanopyConfigKeys.CLUSTER_CONVERGENCE_KEY, String.valueOf(convergenceDelta));
    conf.set(MeanShiftCanopyConfigKeys.T1_KEY, String.valueOf(t1));
    conf.set(MeanShiftCanopyConfigKeys.T2_KEY, String.valueOf(t2));
    conf.set(MeanShiftCanopyConfigKeys.CONTROL_PATH_KEY, control.toString());
    client.setConf(conf);
    try {
        JobClient.runJob(conf);
    } catch (IOException e) {
        log.warn(e.toString(), e);
    }
}

16 View Complete Implementation : TFIDFConverter.java
Copyright Apache License 2.0
Author : sisirkoppaka
/**
 * Create a partial tfidf vector using a chunk of features from the input vectors. The input vectors have to
 * be in the {@link SequenceFile} format
 *
 * @param input
 *          input directory of the vectors in {@link SequenceFile} format
 * @param featureCount
 *          Number of unique features in the dataset
 * @param vectorCount
 *          Number of vectors in the dataset
 * @param minDf
 *          The minimum document frequency. Default 1
 * @param maxDFPercent
 *          The max percentage of vectors for the DF. Can be used to remove really high frequency features.
 *          Expressed as an integer between 0 and 100. Default 99
 * @param dictionaryFilePath
 *          location of the chunk of features and the id's
 * @param output
 *          output directory where the partial vectors have to be created
 * @throws IOException
 */
private static void makePartialVectors(Path input, Long featureCount, Long vectorCount, int minDf, int maxDFPercent, Path dictionaryFilePath, Path output, boolean sequentialAccess) throws IOException {
    Configurable client = new JobClient();
    JobConf conf = new JobConf(TFIDFConverter.class);
    conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization," + "org.apache.hadoop.io.serializer.WritableSerialization");
    // this conf parameter needs to be set to enable serialisation of conf values
    conf.setJobName(": MakePartialVectors: input-folder: " + input + ", dictionary-file: " + dictionaryFilePath.toString());
    conf.setLong(FEATURE_COUNT, featureCount);
    conf.setLong(VECTOR_COUNT, vectorCount);
    conf.setInt(MIN_DF, minDf);
    conf.setInt(MAX_DF_PERCENTAGE, maxDFPercent);
    conf.setBoolean(PartialVectorMerger.SEQUENTIAL_ACCESS, sequentialAccess);
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(VectorWritable.class);
    DistributedCache.setCacheFiles(new URI[] { dictionaryFilePath.toUri() }, conf);
    FileInputFormat.setInputPaths(conf, input);
    FileOutputFormat.setOutputPath(conf, output);
    conf.setMapperClass(IdentityMapper.class);
    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setReducerClass(TFIDFPartialVectorReducer.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    HadoopUtil.overwriteOutput(output);
    client.setConf(conf);
    JobClient.runJob(conf);
}

16 View Complete Implementation : MeanShiftCanopyDriver.java
Copyright Apache License 2.0
Author : sisirkoppaka
static void createCanopyFromVectors(Path input, Path output) {
    Configurable client = new JobClient();
    JobConf conf = new JobConf(MeanShiftCanopyDriver.class);
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(MeanShiftCanopy.class);
    FileInputFormat.setInputPaths(conf, input);
    FileOutputFormat.setOutputPath(conf, output);
    conf.setMapperClass(MeanShiftCanopyCreatorMapper.class);
    conf.setNumReduceTasks(0);
    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    client.setConf(conf);
    try {
        JobClient.runJob(conf);
    } catch (IOException e) {
        log.warn(e.toString(), e);
    }
}

16 View Complete Implementation : MeanShiftCanopyDriver.java
Copyright Apache License 2.0
Author : sisirkoppaka
/**
 * Run the job using supplied arguments
 *
 * @param input
 *          the directory pathname for input points
 * @param clustersIn
 *          the directory pathname for input clusters
 * @param output
 *          the directory pathname for output clustered points
 */
static void runClustering(Path input, Path clustersIn, Path output) {
    JobConf conf = new JobConf(FuzzyKMeansDriver.class);
    conf.setJobName("Mean Shift Clustering");
    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(WeightedVectorWritable.class);
    FileInputFormat.setInputPaths(conf, input);
    FileOutputFormat.setOutputPath(conf, output);
    conf.setMapperClass(MeanShiftCanopyClusterMapper.class);
    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    // uncomment it to run locally
    // conf.set("mapred.job.tracker", "local");
    conf.setNumReduceTasks(0);
    conf.set(STATE_IN_KEY, clustersIn.toString());
    try {
        JobClient.runJob(conf);
    } catch (IOException e) {
        log.warn(e.toString(), e);
    }
}

16 View Complete Implementation : MeanShiftCanopyDriver.java
Copyright Apache License 2.0
Author : ogrisel
static void createCanopyFromVectors(Path input, Path output) {
    Configurable client = new JobClient();
    JobConf conf = new JobConf(MeanShiftCanopyDriver.class);
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(MeanShiftCanopy.class);
    FileInputFormat.setInputPaths(conf, input);
    FileOutputFormat.setOutputPath(conf, output);
    conf.setMapperClass(MeanShiftCanopyCreatorMapper.class);
    conf.setNumReduceTasks(0);
    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    client.setConf(conf);
    try {
        JobClient.runJob(conf);
    } catch (IOException e) {
        log.warn(e.toString(), e);
    }
}

16 View Complete Implementation : InputDriver.java
Copyright Apache License 2.0
Author : sisirkoppaka
public static void runJob(Path input, Path output, String vectorClassName) throws IOException {
    JobClient client = new JobClient();
    JobConf conf = new JobConf(InputDriver.class);
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(VectorWritable.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    conf.set("vector.implementation.class.name", vectorClassName);
    FileInputFormat.setInputPaths(conf, input);
    FileOutputFormat.setOutputPath(conf, output);
    conf.setMapperClass(InputMapper.class);
    conf.setReducerClass(Reducer.class);
    conf.setNumReduceTasks(0);
    client.setConf(conf);
    JobClient.runJob(conf);
}

16 View Complete Implementation : CollocDriver.java
Copyright Apache License 2.0
Author : ogrisel
/**
 * pass2: perform the LLR calculation
 */
public static void computeNGramsPruneByLLR(Path output, Configuration baseConf, long nGramTotal, boolean emitUnigrams, float minLLRValue, int reduceTasks) throws IOException {
    JobConf conf = new JobConf(baseConf, CollocDriver.class);
    conf.setJobName(CollocDriver.class.getSimpleName() + ".computeNGrams: " + output);
    conf.setLong(LLRReducer.NGRAM_TOTAL, nGramTotal);
    conf.setBoolean(EMIT_UNIGRAMS, emitUnigrams);
    conf.setMapOutputKeyClass(Gram.class);
    conf.setMapOutputValueClass(Gram.class);
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(DoubleWritable.class);
    FileInputFormat.setInputPaths(conf, new Path(output, SUBGRAM_OUTPUT_DIRECTORY));
    Path outPath = new Path(output, NGRAM_OUTPUT_DIRECTORY);
    FileOutputFormat.setOutputPath(conf, outPath);
    conf.setMapperClass(IdentityMapper.class);
    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    conf.setReducerClass(LLRReducer.class);
    conf.setNumReduceTasks(reduceTasks);
    conf.setFloat(LLRReducer.MIN_LLR, minLLRValue);
    JobClient.runJob(conf);
}

15 View Complete Implementation : OutputDriver.java
Copyright Apache License 2.0
Author : sisirkoppaka
public static void runJob(String input, String output) throws IOException {
    JobClient client = new JobClient();
    JobConf conf = new JobConf(org.apache.mahout.clustering.syntheticcontrol.meanshift.OutputDriver.class);
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);
    conf.setInputFormat(SequenceFileInputFormat.class);
    FileInputFormat.setInputPaths(conf, new Path(input));
    FileOutputFormat.setOutputPath(conf, new Path(output));
    conf.setMapperClass(OutputMapper.class);
    conf.setReducerClass(Reducer.class);
    conf.setNumReduceTasks(0);
    client.setConf(conf);
    JobClient.runJob(conf);
}

15 View Complete Implementation : DictionaryVectorizer.java
Copyright Apache License 2.0
Author : sisirkoppaka
/**
 * Create a partial vector using a chunk of features from the input documents. The input documents have to be
 * in the {@link SequenceFile} format
 *
 * @param input
 *          input directory of the documents in {@link SequenceFile} format
 * @param maxNGramSize
 *          maximum size of ngrams to generate
 * @param dictionaryFilePath
 *          location of the chunk of features and the id's
 * @param output
 *          output directory where the partial vectors have to be created
 * @param numReducers
 *          the desired number of reducer tasks
 * @throws IOException
 */
private static void makePartialVectors(Path input, int maxNGramSize, Path dictionaryFilePath, Path output, int dimension, boolean sequentialAccess, int numReducers) throws IOException {
    Configurable client = new JobClient();
    JobConf conf = new JobConf(DictionaryVectorizer.class);
    conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization," + "org.apache.hadoop.io.serializer.WritableSerialization");
    // this conf parameter needs to be set to enable serialisation of conf values
    conf.setJobName("DictionaryVectorizer::MakePartialVectors: input-folder: " + input + ", dictionary-file: " + dictionaryFilePath.toString());
    conf.setInt(PartialVectorMerger.DIMENSION, dimension);
    conf.setBoolean(PartialVectorMerger.SEQUENTIAL_ACCESS, sequentialAccess);
    conf.setInt(MAX_NGRAMS, maxNGramSize);
    conf.setMapOutputKeyClass(Text.class);
    conf.setMapOutputValueClass(StringTuple.class);
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(VectorWritable.class);
    DistributedCache.setCacheFiles(new URI[] { dictionaryFilePath.toUri() }, conf);
    FileInputFormat.setInputPaths(conf, input);
    FileOutputFormat.setOutputPath(conf, output);
    conf.setMapperClass(IdentityMapper.class);
    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setReducerClass(TFPartialVectorReducer.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    conf.setNumReduceTasks(numReducers);
    HadoopUtil.overwriteOutput(output);
    client.setConf(conf);
    JobClient.runJob(conf);
}

15 View Complete Implementation : DictionaryVectorizer.java
Copyright Apache License 2.0
Author : ogrisel
/**
 * Create a partial vector using a chunk of features from the input documents. The input documents have to be
 * in the {@link SequenceFile} format
 *
 * @param input
 *          input directory of the documents in {@link SequenceFile} format
 * @param maxNGramSize
 *          maximum size of ngrams to generate
 * @param dictionaryFilePath
 *          location of the chunk of features and the id's
 * @param output
 *          output directory where the partial vectors have to be created
 * @param numReducers
 *          the desired number of reducer tasks
 * @throws IOException
 */
private static void makePartialVectors(Path input, int maxNGramSize, Path dictionaryFilePath, Path output, int dimension, boolean sequentialAccess, int numReducers) throws IOException {
    Configurable client = new JobClient();
    JobConf conf = new JobConf(DictionaryVectorizer.class);
    conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization," + "org.apache.hadoop.io.serializer.WritableSerialization");
    // this conf parameter needs to be set to enable serialisation of conf values
    conf.setJobName("DictionaryVectorizer::MakePartialVectors: input-folder: " + input + ", dictionary-file: " + dictionaryFilePath.toString());
    conf.setInt(PartialVectorMerger.DIMENSION, dimension);
    conf.setBoolean(PartialVectorMerger.SEQUENTIAL_ACCESS, sequentialAccess);
    conf.setInt(MAX_NGRAMS, maxNGramSize);
    conf.setMapOutputKeyClass(Text.class);
    conf.setMapOutputValueClass(StringTuple.class);
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(VectorWritable.class);
    DistributedCache.setCacheFiles(new URI[] { dictionaryFilePath.toUri() }, conf);
    FileInputFormat.setInputPaths(conf, input);
    FileOutputFormat.setOutputPath(conf, output);
    conf.setMapperClass(IdentityMapper.class);
    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setReducerClass(TFPartialVectorReducer.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    conf.setNumReduceTasks(numReducers);
    HadoopUtil.overwriteOutput(output);
    client.setConf(conf);
    JobClient.runJob(conf);
}

15 View Complete Implementation : WikipediaToSequenceFile.java
Copyright Apache License 2.0
Author : ogrisel
/**
 * Run the job
 *
 * @param input
 *          the input pathname String
 * @param output
 *          the output pathname String
 * @param catFile
 *          the file containing the Wikipedia categories
 * @param exactMatchOnly
 *          if true, then the Wikipedia category must match exactly instead of simply containing the
 *          category string
 * @param all
 *          if true select all categories
 */
public static void runJob(String input, String output, String catFile, boolean exactMatchOnly, boolean all) throws IOException {
    JobClient client = new JobClient();
    JobConf conf = new JobConf(WikipediaToSequenceFile.class);
    if (WikipediaToSequenceFile.log.isInfoEnabled()) {
        log.info("Input: " + input + " Out: " + output + " Categories: " + catFile + " All Files: " + all);
    }
    conf.set("xmlinput.start", "<page>");
    conf.set("xmlinput.end", "</page>");
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);
    conf.setBoolean("exact.match.only", exactMatchOnly);
    conf.setBoolean("all.files", all);
    FileInputFormat.setInputPaths(conf, new Path(input));
    Path outPath = new Path(output);
    FileOutputFormat.setOutputPath(conf, outPath);
    conf.setMapperClass(WikipediaMapper.class);
    conf.setInputFormat(XmlInputFormat.class);
    conf.setReducerClass(IdentityReducer.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization," + "org.apache.hadoop.io.serializer.WritableSerialization");
    /*
     * conf.set("mapred.compress.map.output", "true"); conf.set("mapred.map.output.compression.type",
     * "BLOCK"); conf.set("mapred.output.compress", "true"); conf.set("mapred.output.compression.type",
     * "BLOCK"); conf.set("mapred.output.compression.codec", "org.apache.hadoop.io.compress.GzipCodec");
     */
    HadoopUtil.overwriteOutput(outPath);
    Set<String> categories = new HashSet<String>();
    if (catFile.length() > 0) {
        for (String line : new FileLineIterable(new File(catFile))) {
            categories.add(line.trim().toLowerCase(Locale.ENGLISH));
        }
    }
    DefaultStringifier<Set<String>> setStringifier = new DefaultStringifier<Set<String>>(conf, GenericsUtil.getClass(categories));
    String categoriesStr = setStringifier.toString(categories);
    conf.set("wikipedia.categories", categoriesStr);
    client.setConf(conf);
    JobClient.runJob(conf);
}

15 View Complete Implementation : WikipediaToSequenceFile.java
Copyright Apache License 2.0
Author : sisirkoppaka
/**
 * Run the job
 *
 * @param input
 *          the input pathname String
 * @param output
 *          the output pathname String
 * @param catFile
 *          the file containing the Wikipedia categories
 * @param exactMatchOnly
 *          if true, then the Wikipedia category must match exactly instead of simply containing the
 *          category string
 * @param all
 *          if true select all categories
 */
public static void runJob(String input, String output, String catFile, boolean exactMatchOnly, boolean all) throws IOException {
    JobClient client = new JobClient();
    JobConf conf = new JobConf(WikipediaToSequenceFile.class);
    if (WikipediaToSequenceFile.log.isInfoEnabled()) {
        log.info("Input: " + input + " Out: " + output + " Categories: " + catFile + " All Files: " + all);
    }
    conf.set("xmlinput.start", "<page>");
    conf.set("xmlinput.end", "</page>");
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);
    conf.setBoolean("exact.match.only", exactMatchOnly);
    conf.setBoolean("all.files", all);
    FileInputFormat.setInputPaths(conf, new Path(input));
    Path outPath = new Path(output);
    FileOutputFormat.setOutputPath(conf, outPath);
    conf.setMapperClass(WikipediaMapper.class);
    conf.setInputFormat(XmlInputFormat.class);
    conf.setReducerClass(IdentityReducer.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization," + "org.apache.hadoop.io.serializer.WritableSerialization");
    /*
     * conf.set("mapred.compress.map.output", "true"); conf.set("mapred.map.output.compression.type",
     * "BLOCK"); conf.set("mapred.output.compress", "true"); conf.set("mapred.output.compression.type",
     * "BLOCK"); conf.set("mapred.output.compression.codec", "org.apache.hadoop.io.compress.GzipCodec");
     */
    HadoopUtil.overwriteOutput(outPath);
    Set<String> categories = new HashSet<String>();
    if (catFile.length() > 0) {
        for (String line : new FileLineIterable(new File(catFile))) {
            categories.add(line.trim().toLowerCase());
        }
    }
    DefaultStringifier<Set<String>> setStringifier = new DefaultStringifier<Set<String>>(conf, GenericsUtil.getClass(categories));
    String categoriesStr = setStringifier.toString(categories);
    conf.set("wikipedia.categories", categoriesStr);
    client.setConf(conf);
    JobClient.runJob(conf);
}

15 View Complete Implementation : TxnGenerator.java
Copyright Apache License 2.0
Author : facebookarchive
private static JobConf createJobConf(Configuration conf) throws IOException {
    JobConf jobConf = new JobConf(conf);
    String jobName = "transaction_generator";
    jobConf.setJobName(jobName);
    String splitDir = workplace + "split/";
    jobConf.set(TEST_DIR_LABEL, workplace);
    jobConf.setMapSpeculativeExecution(false);
    jobConf.setJarByClass(TxnGenerator.class);
    jobConf.setMapperClass(GeneratorMapper.class);
    jobConf.setInputFormat(TextInputFormat.class);
    FileInputFormat.addInputPath(jobConf, new Path(splitDir));
    Random random = new Random();
    FileOutputFormat.setOutputPath(jobConf, new Path(workplace, "output" + random.nextLong()));
    jobConf.setNumReduceTasks(0);
    jobConf.setNumMapTasks(numMappers);
    createSplitFiles(conf, new Path(splitDir));
    return jobConf;
}

15 View Complete Implementation : TestFileSystem.java
Copyright Apache License 2.0
Author : facebookarchive
public static void readTest(FileSystem fs, boolean fastCheck) throws Exception {
    fs.delete(READ_DIR, true);
    JobConf job = new JobConf(conf, TestFileSystem.class);
    job.setBoolean("fs.test.fastCheck", fastCheck);
    FileInputFormat.setInputPaths(job, CONTROL_DIR);
    job.setInputFormat(SequenceFileInputFormat.class);
    job.setMapperClass(ReadMapper.class);
    job.setReducerClass(LongSumReducer.class);
    FileOutputFormat.setOutputPath(job, READ_DIR);
    job.setOutputKeyClass(UTF8.class);
    job.setOutputValueClass(LongWritable.class);
    job.setNumReduceTasks(1);
    JobClient.runJob(job);
}

15 View Complete Implementation : TestFileSystem.java
Copyright Apache License 2.0
Author : facebookarchive
public static void writeTest(FileSystem fs, boolean fastCheck) throws Exception {
    fs.delete(DATA_DIR, true);
    fs.delete(WRITE_DIR, true);
    JobConf job = new JobConf(conf, TestFileSystem.class);
    job.setBoolean("fs.test.fastCheck", fastCheck);
    FileInputFormat.setInputPaths(job, CONTROL_DIR);
    job.setInputFormat(SequenceFileInputFormat.class);
    job.setMapperClass(WriteMapper.class);
    job.setReducerClass(LongSumReducer.class);
    FileOutputFormat.setOutputPath(job, WRITE_DIR);
    job.setOutputKeyClass(UTF8.class);
    job.setOutputValueClass(LongWritable.class);
    job.setNumReduceTasks(1);
    JobClient.runJob(job);
}

15 View Complete Implementation : ExternalMapReduce.java
Copyright Apache License 2.0
Author : facebookarchive
public int run(String[] argv) throws IOException {
    if (argv.length < 2) {
        System.out.println("ExternalMapReduce <input> <output>");
        return -1;
    }
    Path outDir = new Path(argv[1]);
    Path input = new Path(argv[0]);
    JobConf testConf = new JobConf(getConf(), ExternalMapReduce.class);
    // try to load a class from libjar
    try {
        testConf.getClassByName("testjar.ClassWordCount");
    } catch (ClassNotFoundException e) {
        System.out.println("Could not find class from libjar");
        return -1;
    }
    testConf.setJobName("external job");
    FileInputFormat.setInputPaths(testConf, input);
    FileOutputFormat.setOutputPath(testConf, outDir);
    testConf.setMapperClass(MapClass.class);
    testConf.setReducerClass(Reduce.class);
    testConf.setNumReduceTasks(1);
    JobClient.runJob(testConf);
    return 0;
}

15 View Complete Implementation : TestFileSystem.java
Copyright Apache License 2.0
Author : facebookarchive
public static void seekTest(FileSystem fs, boolean fastCheck) throws Exception {
    fs.delete(READ_DIR, true);
    JobConf job = new JobConf(conf, TestFileSystem.class);
    job.setBoolean("fs.test.fastCheck", fastCheck);
    FileInputFormat.setInputPaths(job, CONTROL_DIR);
    job.setInputFormat(SequenceFileInputFormat.class);
    job.setMapperClass(SeekMapper.class);
    job.setReducerClass(LongSumReducer.class);
    FileOutputFormat.setOutputPath(job, READ_DIR);
    job.setOutputKeyClass(UTF8.class);
    job.setOutputValueClass(LongWritable.class);
    job.setNumReduceTasks(1);
    JobClient.runJob(job);
}

15 View Complete Implementation : OutputDriver.java
Copyright Apache License 2.0
Author : ogrisel
public static void runJob(String input, String output) throws IOException {
    JobClient client = new JobClient();
    JobConf conf = new JobConf(org.apache.mahout.clustering.syntheticcontrol.meanshift.OutputDriver.class);
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);
    conf.setInputFormat(SequenceFileInputFormat.class);
    FileInputFormat.setInputPaths(conf, new Path(input));
    FileOutputFormat.setOutputPath(conf, new Path(output));
    conf.setMapperClass(OutputMapper.class);
    conf.setReducerClass(Reducer.class);
    conf.setNumReduceTasks(0);
    client.setConf(conf);
    JobClient.runJob(conf);
}

14 View Complete Implementation : BayesThetaNormalizerDriver.java
Copyright Apache License 2.0
Author : sisirkoppaka
@Override
public void runJob(Path input, Path output, BayesParameters params) throws IOException {
    Configurable client = new JobClient();
    JobConf conf = new JobConf(BayesThetaNormalizerDriver.class);
    conf.setJobName("Bayes Theta Normalizer Driver running over input: " + input);
    conf.setOutputKeyClass(StringTuple.class);
    conf.setOutputValueClass(DoubleWritable.class);
    FileInputFormat.addInputPath(conf, new Path(output, "trainer-tfIdf/trainer-tfIdf"));
    Path outPath = new Path(output, "trainer-thetaNormalizer");
    FileOutputFormat.setOutputPath(conf, outPath);
    // conf.setNumMapTasks(100);
    // conf.setNumReduceTasks(1);
    conf.setMapperClass(BayesThetaNormalizerMapper.class);
    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setCombinerClass(BayesThetaNormalizerReducer.class);
    conf.setReducerClass(BayesThetaNormalizerReducer.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization," + "org.apache.hadoop.io.serializer.WritableSerialization");
    // Don't ever forget this. People should keep track of how Hadoop conf
    // parameters can make or break a piece of code
    HadoopUtil.overwriteOutput(outPath);
    FileSystem dfs = FileSystem.get(outPath.toUri(), conf);
    Path sigmaKFiles = new Path(output, "trainer-weights/Sigma_k/*");
    Map<String, Double> labelWeightSum = SequenceFileModelReader.readLabelSums(dfs, sigmaKFiles, conf);
    DefaultStringifier<Map<String, Double>> mapStringifier = new DefaultStringifier<Map<String, Double>>(conf, GenericsUtil.getClass(labelWeightSum));
    String labelWeightSumString = mapStringifier.toString(labelWeightSum);
    log.info("Sigma_k for Each Label");
    Map<String, Double> c = mapStringifier.fromString(labelWeightSumString);
    log.info("{}", c);
    conf.set("cnaivebayes.sigma_k", labelWeightSumString);
    Path sigmaJSigmaKFile = new Path(output, "trainer-weights/Sigma_kSigma_j/*");
    double sigmaJSigmaK = SequenceFileModelReader.readSigmaJSigmaK(dfs, sigmaJSigmaKFile, conf);
    DefaultStringifier<Double> stringifier = new DefaultStringifier<Double>(conf, Double.class);
    String sigmaJSigmaKString = stringifier.toString(sigmaJSigmaK);
    log.info("Sigma_kSigma_j for each Label and for each Features");
    double retSigmaJSigmaK = stringifier.fromString(sigmaJSigmaKString);
    log.info("{}", retSigmaJSigmaK);
    conf.set("cnaivebayes.sigma_jSigma_k", sigmaJSigmaKString);
    Path vocabCountFile = new Path(output, "trainer-tfIdf/trainer-vocabCount/*");
    double vocabCount = SequenceFileModelReader.readVocabCount(dfs, vocabCountFile, conf);
    String vocabCountString = stringifier.toString(vocabCount);
    log.info("Vocabulary Count");
    conf.set("cnaivebayes.vocabCount", vocabCountString);
    double retvocabCount = stringifier.fromString(vocabCountString);
    log.info("{}", retvocabCount);
    conf.set("bayes.parameters", params.toString());
    conf.set("output.table", output.toString());
    client.setConf(conf);
    JobClient.runJob(conf);
}

14 View Complete Implementation : CBayesThetaNormalizerDriver.java
Copyright Apache License 2.0
Author : ogrisel
@Override
public void runJob(Path input, Path output, BayesParameters params) throws IOException {
    Configurable client = new JobClient();
    JobConf conf = new JobConf(CBayesThetaNormalizerDriver.class);
    conf.setJobName("Complementary Bayes Theta Normalizer Driver running over input: " + input);
    conf.setOutputKeyClass(StringTuple.class);
    conf.setOutputValueClass(DoubleWritable.class);
    FileInputFormat.addInputPath(conf, new Path(output, "trainer-weights/Sigma_j"));
    FileInputFormat.addInputPath(conf, new Path(output, "trainer-tfIdf/trainer-tfIdf"));
    Path outPath = new Path(output, "trainer-thetaNormalizer");
    FileOutputFormat.setOutputPath(conf, outPath);
    // conf.setNumMapTasks(100);
    // conf.setNumReduceTasks(1);
    conf.setMapperClass(CBayesThetaNormalizerMapper.class);
    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setCombinerClass(CBayesThetaNormalizerReducer.class);
    conf.setReducerClass(CBayesThetaNormalizerReducer.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,org.apache.hadoop.io.serializer.WritableSerialization");
    // Don't ever forget this. People should keep track of how Hadoop conf
    // parameters can make or break a piece of code
    FileSystem dfs = FileSystem.get(outPath.toUri(), conf);
    HadoopUtil.overwriteOutput(outPath);
    Path sigmaKFiles = new Path(output, "trainer-weights/Sigma_k/*");
    Map<String, Double> labelWeightSum = SequenceFileModelReader.readLabelSums(dfs, sigmaKFiles, conf);
    DefaultStringifier<Map<String, Double>> mapStringifier = new DefaultStringifier<Map<String, Double>>(conf, GenericsUtil.getClass(labelWeightSum));
    String labelWeightSumString = mapStringifier.toString(labelWeightSum);
    log.info("Sigma_k for Each Label");
    Map<String, Double> c = mapStringifier.fromString(labelWeightSumString);
    log.info("{}", c);
    conf.set("cnaivebayes.sigma_k", labelWeightSumString);
    Path sigmaKSigmaJFile = new Path(output, "trainer-weights/Sigma_kSigma_j/*");
    double sigmaJSigmaK = SequenceFileModelReader.readSigmaJSigmaK(dfs, sigmaKSigmaJFile, conf);
    DefaultStringifier<Double> stringifier = new DefaultStringifier<Double>(conf, Double.class);
    String sigmaJSigmaKString = stringifier.toString(sigmaJSigmaK);
    log.info("Sigma_kSigma_j for each Label and for each Features");
    double retSigmaJSigmaK = stringifier.fromString(sigmaJSigmaKString);
    log.info("{}", retSigmaJSigmaK);
    conf.set("cnaivebayes.sigma_jSigma_k", sigmaJSigmaKString);
    Path vocabCountFile = new Path(output, "trainer-tfIdf/trainer-vocabCount/*");
    double vocabCount = SequenceFileModelReader.readVocabCount(dfs, vocabCountFile, conf);
    String vocabCountString = stringifier.toString(vocabCount);
    log.info("Vocabulary Count");
    conf.set("cnaivebayes.vocabCount", vocabCountString);
    double retvocabCount = stringifier.fromString(vocabCountString);
    log.info("{}", retvocabCount);
    conf.set("bayes.parameters", params.toString());
    conf.set("output.table", output.toString());
    client.setConf(conf);
    JobClient.runJob(conf);
}

14 View Complete Implementation : BayesThetaNormalizerDriver.java
Copyright Apache License 2.0
Author : ogrisel
@Override
public void runJob(Path input, Path output, BayesParameters params) throws IOException {
    Configurable client = new JobClient();
    JobConf conf = new JobConf(BayesThetaNormalizerDriver.class);
    conf.setJobName("Bayes Theta Normalizer Driver running over input: " + input);
    conf.setOutputKeyClass(StringTuple.class);
    conf.setOutputValueClass(DoubleWritable.class);
    FileInputFormat.addInputPath(conf, new Path(output, "trainer-tfIdf/trainer-tfIdf"));
    Path outPath = new Path(output, "trainer-thetaNormalizer");
    FileOutputFormat.setOutputPath(conf, outPath);
    // conf.setNumMapTasks(100);
    // conf.setNumReduceTasks(1);
    conf.setMapperClass(BayesThetaNormalizerMapper.class);
    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setCombinerClass(BayesThetaNormalizerReducer.class);
    conf.setReducerClass(BayesThetaNormalizerReducer.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization," + "org.apache.hadoop.io.serializer.WritableSerialization");
    // Don't ever forget this. People should keep track of how Hadoop conf
    // parameters can make or break a piece of code
    HadoopUtil.overwriteOutput(outPath);
    FileSystem dfs = FileSystem.get(outPath.toUri(), conf);
    Path sigmaKFiles = new Path(output, "trainer-weights/Sigma_k/*");
    Map<String, Double> labelWeightSum = SequenceFileModelReader.readLabelSums(dfs, sigmaKFiles, conf);
    DefaultStringifier<Map<String, Double>> mapStringifier = new DefaultStringifier<Map<String, Double>>(conf, GenericsUtil.getClass(labelWeightSum));
    String labelWeightSumString = mapStringifier.toString(labelWeightSum);
    log.info("Sigma_k for Each Label");
    Map<String, Double> c = mapStringifier.fromString(labelWeightSumString);
    log.info("{}", c);
    conf.set("cnaivebayes.sigma_k", labelWeightSumString);
    Path sigmaJSigmaKFile = new Path(output, "trainer-weights/Sigma_kSigma_j/*");
    double sigmaJSigmaK = SequenceFileModelReader.readSigmaJSigmaK(dfs, sigmaJSigmaKFile, conf);
    DefaultStringifier<Double> stringifier = new DefaultStringifier<Double>(conf, Double.class);
    String sigmaJSigmaKString = stringifier.toString(sigmaJSigmaK);
    log.info("Sigma_kSigma_j for each Label and for each Features");
    double retSigmaJSigmaK = stringifier.fromString(sigmaJSigmaKString);
    log.info("{}", retSigmaJSigmaK);
    conf.set("cnaivebayes.sigma_jSigma_k", sigmaJSigmaKString);
    Path vocabCountFile = new Path(output, "trainer-tfIdf/trainer-vocabCount/*");
    double vocabCount = SequenceFileModelReader.readVocabCount(dfs, vocabCountFile, conf);
    String vocabCountString = stringifier.toString(vocabCount);
    log.info("Vocabulary Count");
    conf.set("cnaivebayes.vocabCount", vocabCountString);
    double retvocabCount = stringifier.fromString(vocabCountString);
    log.info("{}", retvocabCount);
    conf.set("bayes.parameters", params.toString());
    conf.set("output.table", output.toString());
    client.setConf(conf);
    JobClient.runJob(conf);
}

14 View Complete Implementation : CBayesThetaNormalizerDriver.java
Copyright Apache License 2.0
Author : sisirkoppaka
@Override
public void runJob(Path input, Path output, BayesParameters params) throws IOException {
    Configurable client = new JobClient();
    JobConf conf = new JobConf(CBayesThetaNormalizerDriver.class);
    conf.setJobName("Complementary Bayes Theta Normalizer Driver running over input: " + input);
    conf.setOutputKeyClass(StringTuple.class);
    conf.setOutputValueClass(DoubleWritable.class);
    FileInputFormat.addInputPath(conf, new Path(output, "trainer-weights/Sigma_j"));
    FileInputFormat.addInputPath(conf, new Path(output, "trainer-tfIdf/trainer-tfIdf"));
    Path outPath = new Path(output, "trainer-thetaNormalizer");
    FileOutputFormat.setOutputPath(conf, outPath);
    // conf.setNumMapTasks(100);
    // conf.setNumReduceTasks(1);
    conf.setMapperClass(CBayesThetaNormalizerMapper.class);
    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setCombinerClass(CBayesThetaNormalizerReducer.class);
    conf.setReducerClass(CBayesThetaNormalizerReducer.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,org.apache.hadoop.io.serializer.WritableSerialization");
    // Don't ever forget this. People should keep track of how Hadoop conf
    // parameters can make or break a piece of code.
    FileSystem dfs = FileSystem.get(outPath.toUri(), conf);
    HadoopUtil.overwriteOutput(outPath);
    Path sigmaKFiles = new Path(output, "trainer-weights/Sigma_k/*");
    Map<String, Double> labelWeightSum = SequenceFileModelReader.readLabelSums(dfs, sigmaKFiles, conf);
    DefaultStringifier<Map<String, Double>> mapStringifier = new DefaultStringifier<Map<String, Double>>(conf, GenericsUtil.getClass(labelWeightSum));
    String labelWeightSumString = mapStringifier.toString(labelWeightSum);
    log.info("Sigma_k for Each Label");
    Map<String, Double> c = mapStringifier.fromString(labelWeightSumString);
    log.info("{}", c);
    conf.set("cnaivebayes.sigma_k", labelWeightSumString);
    Path sigmaKSigmaJFile = new Path(output, "trainer-weights/Sigma_kSigma_j/*");
    double sigmaJSigmaK = SequenceFileModelReader.readSigmaJSigmaK(dfs, sigmaKSigmaJFile, conf);
    DefaultStringifier<Double> stringifier = new DefaultStringifier<Double>(conf, Double.class);
    String sigmaJSigmaKString = stringifier.toString(sigmaJSigmaK);
    log.info("Sigma_kSigma_j for each Label and for each Features");
    double retSigmaJSigmaK = stringifier.fromString(sigmaJSigmaKString);
    log.info("{}", retSigmaJSigmaK);
    conf.set("cnaivebayes.sigma_jSigma_k", sigmaJSigmaKString);
    Path vocabCountFile = new Path(output, "trainer-tfIdf/trainer-vocabCount/*");
    double vocabCount = SequenceFileModelReader.readVocabCount(dfs, vocabCountFile, conf);
    String vocabCountString = stringifier.toString(vocabCount);
    log.info("Vocabulary Count");
    conf.set("cnaivebayes.vocabCount", vocabCountString);
    double retvocabCount = stringifier.fromString(vocabCountString);
    log.info("{}", retvocabCount);
    conf.set("bayes.parameters", params.toString());
    conf.set("output.table", output.toString());
    client.setConf(conf);
    JobClient.runJob(conf);
}

14 View Complete Implementation : PartialBuilder.java
Copyright Apache License 2.0
Author : ogrisel
@Override
protected void configureJob(JobConf job, int nbTrees, boolean oobEstimate) throws IOException {
    FileInputFormat.setInputPaths(job, getDataPath());
    FileOutputFormat.setOutputPath(job, getOutputPath(job));
    job.setOutputKeyClass(TreeID.class);
    job.setOutputValueClass(MapredOutput.class);
    job.setMapperClass(Step1Mapper.class);
    // no reducers
    job.setNumReduceTasks(0);
    job.setInputFormat(TextInputFormat.class);
    job.setOutputFormat(SequenceFileOutputFormat.class);
    // if we are in 'local' mode, correct the number of maps
    // or the mappers won't be able to compute the right indexes
    String tracker = job.get("mapred.job.tracker", "local");
    if ("local".equals(tracker)) {
        log.warn("Hadoop running in 'local' mode, only one map task will be launched");
        job.setNumMapTasks(1);
    }
}
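
For comparison with the full map/reduce drivers, here is a stripped-down, hypothetical map-only driver in the same style as configureJob() above: text in, sequence files out, zero reducers. The input and output paths are placeholders taken from the command line.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.lib.IdentityMapper;

public class MapOnlyJobSketch {
    public static void main(String[] args) throws Exception {
        JobConf job = new JobConf(MapOnlyJobSketch.class);
        job.setJobName("map-only-sketch");
        // keys/values as produced by TextInputFormat, passed through unchanged
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Text.class);
        job.setMapperClass(IdentityMapper.class);
        // no reducers: map output goes straight to the output format
        job.setNumReduceTasks(0);
        job.setInputFormat(TextInputFormat.class);
        job.setOutputFormat(SequenceFileOutputFormat.class);
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        JobClient.runJob(job);
    }
}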

14 View Complete Implementation : TxnConsumer.java
Copyright Apache License 2.0
Author : facebookarchive
private JobConf createJobConf(Configuration conf2) throws IOException {
    JobConf jobConf = new JobConf(conf);
    String jobName = "transaction_consumer";
    jobConf.setJobName(jobName);
    String splitDir = workplace + "split/";
    jobConf.set(TEST_DIR_LABEL, workplace);
    jobConf.set(NOTIFIER_SERVER_ADDR_KEY, notifierServerAddrStr);
    jobConf.set(NOTIFIER_SERVER_PORT_KEY, notifierServerPortStr);
    jobConf.setMapSpeculativeExecution(false);
    jobConf.setReduceSpeculativeExecution(false);
    jobConf.setJarByClass(TxnConsumer.class);
    jobConf.setMapperClass(ConsumerMapper.class);
    jobConf.setReducerClass(ConsumerReducer.class);
    jobConf.setMapOutputKeyClass(Text.class);
    jobConf.setMapOutputValueClass(Text.class);
    jobConf.setOutputKeyClass(Text.class);
    jobConf.setOutputValueClass(Text.class);
    jobConf.setInputFormat(TextInputFormat.class);
    jobConf.setOutputFormat(TextOutputFormat.class);
    FileInputFormat.addInputPath(jobConf, new Path(splitDir));
    Random random = new Random();
    FileOutputFormat.setOutputPath(jobConf, new Path(workplace, "output" + random.nextLong()));
    jobConf.setNumMapTasks(numMappers);
    createSplitFiles(conf, new Path(splitDir));
    return jobConf;
}
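
createJobConf() only builds the configuration; the caller still has to submit it. A small, self-contained sketch of that submission step (the class name is an assumption, not part of the project):

import java.io.IOException;

import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RunningJob;

public final class JobSubmitSketch {
    private JobSubmitSketch() {
    }

    // Submit a fully configured JobConf (e.g. one returned by a factory
    // method such as createJobConf above) and report the outcome.
    public static boolean submit(JobConf jobConf) throws IOException {
        // runJob() blocks until the job completes and throws IOException on failure.
        RunningJob running = JobClient.runJob(jobConf);
        return running.isSuccessful();
    }
}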

14 View Complete Implementation : CollocDriver.java
Copyright Apache License 2.0
Author : ogrisel
/**
 * pass1: generate collocations, ngrams
 */
public static long generateCollocations(Path input, Path output, Configuration baseConf, boolean emitUnigrams, int maxNGramSize, int reduceTasks, int minSupport) throws IOException {
    JobConf conf = new JobConf(baseConf, CollocDriver.class);
    conf.setJobName(CollocDriver.clreplaced.getSimpleName() + ".generateCollocations:" + input);
    conf.setMapOutputKeyClass(GramKey.class);
    conf.setMapOutputValueClass(Gram.class);
    conf.setPartitionerClass(GramKeyPartitioner.class);
    conf.setOutputValueGroupingComparator(GramKeyGroupComparator.class);
    conf.setOutputKeyClass(Gram.class);
    conf.setOutputValueClass(Gram.class);
    conf.setCombinerClass(CollocCombiner.class);
    conf.setBoolean(EMIT_UNIGRAMS, emitUnigrams);
    FileInputFormat.setInputPaths(conf, input);
    Path outputPath = new Path(output, SUBGRAM_OUTPUT_DIRECTORY);
    FileOutputFormat.setOutputPath(conf, outputPath);
    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setMapperClass(CollocMapper.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    conf.setReducerClass(CollocReducer.class);
    conf.setInt(CollocMapper.MAX_SHINGLE_SIZE, maxNGramSize);
    conf.setInt(CollocReducer.MIN_SUPPORT, minSupport);
    conf.setNumReduceTasks(reduceTasks);
    RunningJob job = JobClient.runJob(conf);
    return job.getCounters().findCounter(CollocMapper.Count.NGRAM_TOTAL).getValue();
}
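
generateCollocations() returns its result by reading a counter from the finished job, and the same pattern works for any user-defined counter enum. A short sketch with a hypothetical enum (MyCounters is not part of the project):

import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RunningJob;

public class CounterSketch {

    // A hypothetical counter enum; map/reduce tasks would call
    // reporter.incrCounter(MyCounters.RECORDS_SEEN, 1) to increment it.
    public enum MyCounters {
        RECORDS_SEEN
    }

    public static long runAndCount(JobConf conf) throws Exception {
        RunningJob job = JobClient.runJob(conf);
        // Counters are aggregated across all tasks once the job has finished.
        return job.getCounters().findCounter(MyCounters.RECORDS_SEEN).getValue();
    }
}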

14 View Complete Implementation : CollocDriver.java
Copyright Apache License 2.0
Author : sisirkoppaka
/**
 * pass1: generate collocations, ngrams
 */
public static long generateCollocations(Path input, Path output, boolean emitUnigrams, int maxNGramSize, int reduceTasks, int minSupport) throws IOException {
    JobConf conf = new JobConf(CollocDriver.class);
    conf.setJobName(CollocDriver.clreplaced.getSimpleName() + ".generateCollocations:" + input);
    conf.setMapOutputKeyClass(GramKey.class);
    conf.setMapOutputValueClass(Gram.class);
    conf.setPartitionerClass(GramKeyPartitioner.class);
    conf.setOutputValueGroupingComparator(GramKeyGroupComparator.class);
    conf.setOutputKeyClass(Gram.class);
    conf.setOutputValueClass(Gram.class);
    conf.setCombinerClass(CollocCombiner.class);
    conf.setBoolean(EMIT_UNIGRAMS, emitUnigrams);
    FileInputFormat.setInputPaths(conf, input);
    Path outPath = new Path(output, SUBGRAM_OUTPUT_DIRECTORY);
    FileOutputFormat.setOutputPath(conf, outPath);
    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setMapperClass(CollocMapper.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    conf.setReducerClass(CollocReducer.class);
    conf.setInt(CollocMapper.MAX_SHINGLE_SIZE, maxNGramSize);
    conf.setInt(CollocReducer.MIN_SUPPORT, minSupport);
    conf.setNumReduceTasks(reduceTasks);
    RunningJob job = JobClient.runJob(conf);
    return job.getCounters().findCounter(CollocMapper.Count.NGRAM_TOTAL).getValue();
}

14 View Complete Implementation : WikipediaDatasetCreatorDriver.java
Copyright Apache License 2.0
Author : sisirkoppaka
/**
 * Run the job
 *
 * @param input
 *          the input pathname String
 * @param output
 *          the output pathname String
 * @param catFile
 *          the file containing the Wikipedia categories
 * @param exactMatchOnly
 *          if true, then the Wikipedia category must match exactly instead of simply containing the
 *          category string
 */
public static void runJob(String input, String output, String catFile, boolean exactMatchOnly, Class<? extends Analyzer> analyzerClass) throws IOException {
    JobClient client = new JobClient();
    JobConf conf = new JobConf(WikipediaDatasetCreatorDriver.class);
    if (WikipediaDatasetCreatorDriver.log.isInfoEnabled()) {
        log.info("Input: {} Out: {} Categories: {}", new Object[] { input, output, catFile });
    }
    conf.set("key.value.separator.in.input.line", " ");
    conf.set("xmlinput.start", "<text xml:space=\"preserve\">");
    conf.set("xmlinput.end", "</text>");
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);
    conf.setBoolean("exact.match.only", exactMatchOnly);
    conf.set("analyzer.class", analyzerClass.getName());
    FileInputFormat.setInputPaths(conf, new Path(input));
    Path outPath = new Path(output);
    FileOutputFormat.setOutputPath(conf, outPath);
    conf.setMapperClass(WikipediaDatasetCreatorMapper.class);
    conf.setNumMapTasks(100);
    conf.setInputFormat(XmlInputFormat.class);
    // conf.setCombinerClass(WikipediaDatasetCreatorReducer.class);
    conf.setReducerClass(WikipediaDatasetCreatorReducer.class);
    conf.setOutputFormat(WikipediaDatasetCreatorOutputFormat.class);
    conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,org.apache.hadoop.io.serializer.WritableSerialization");
    // Don't ever forget this. People should keep track of how Hadoop conf
    // parameters can make or break a piece of code.
    HadoopUtil.overwriteOutput(outPath);
    Set<String> categories = new HashSet<String>();
    for (String line : new FileLineIterable(new File(catFile))) {
        categories.add(line.trim().toLowerCase());
    }
    DefaultStringifier<Set<String>> setStringifier = new DefaultStringifier<Set<String>>(conf, GenericsUtil.getClass(categories));
    String categoriesStr = setStringifier.toString(categories);
    conf.set("wikipedia.categories", categoriesStr);
    client.setConf(conf);
    JobClient.runJob(conf);
}

14 View Complete Implementation : WikipediaDatasetCreatorDriver.java
Copyright Apache License 2.0
Author : ogrisel
/**
 * Run the job
 *
 * @param input
 *          the input pathname String
 * @param output
 *          the output pathname String
 * @param catFile
 *          the file containing the Wikipedia categories
 * @param exactMatchOnly
 *          if true, then the Wikipedia category must match exactly instead of simply containing the
 *          category string
 */
public static void runJob(String input, String output, String catFile, boolean exactMatchOnly, Class<? extends Analyzer> analyzerClass) throws IOException {
    JobClient client = new JobClient();
    JobConf conf = new JobConf(WikipediaDatasetCreatorDriver.class);
    if (log.isInfoEnabled()) {
        log.info("Input: {} Out: {} Categories: {}", new Object[] { input, output, catFile });
    }
    conf.set("key.value.separator.in.input.line", " ");
    conf.set("xmlinput.start", "<text xml:space=\"preserve\">");
    conf.set("xmlinput.end", "</text>");
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);
    conf.setBoolean("exact.match.only", exactMatchOnly);
    conf.set("analyzer.class", analyzerClass.getName());
    FileInputFormat.setInputPaths(conf, new Path(input));
    Path outPath = new Path(output);
    FileOutputFormat.setOutputPath(conf, outPath);
    conf.setMapperClass(WikipediaDatasetCreatorMapper.class);
    conf.setNumMapTasks(100);
    conf.setInputFormat(XmlInputFormat.class);
    // conf.setCombinerClass(WikipediaDatasetCreatorReducer.class);
    conf.setReducerClass(WikipediaDatasetCreatorReducer.class);
    conf.setOutputFormat(WikipediaDatasetCreatorOutputFormat.class);
    conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization," + "org.apache.hadoop.io.serializer.WritableSerialization");
    // Don't ever forget this. People should keep track of how Hadoop conf
    // parameters can make or break a piece of code.
    HadoopUtil.overwriteOutput(outPath);
    Set<String> categories = new HashSet<String>();
    for (String line : new FileLineIterable(new File(catFile))) {
        categories.add(line.trim().toLowerCase(Locale.ENGLISH));
    }
    DefaultStringifier<Set<String>> setStringifier = new DefaultStringifier<Set<String>>(conf, GenericsUtil.getClass(categories));
    String categoriesStr = setStringifier.toString(categories);
    conf.set("wikipedia.categories", categoriesStr);
    client.setConf(conf);
    JobClient.runJob(conf);
}
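
Both variants of this driver serialize the category set into the JobConf with DefaultStringifier so the mapper side can decode it again with fromString(). A minimal round-trip sketch under the same io.serializations setting (the conf key wiki.categories.sketch is illustrative):

import java.io.IOException;
import java.util.HashSet;
import java.util.Set;

import org.apache.hadoop.io.DefaultStringifier;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.util.GenericsUtil;

public class StringifierSketch {
    public static void main(String[] args) throws IOException {
        JobConf conf = new JobConf(StringifierSketch.class);
        // JavaSerialization is needed so a plain HashSet can be stringified.
        conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,"
            + "org.apache.hadoop.io.serializer.WritableSerialization");

        Set<String> categories = new HashSet<String>();
        categories.add("science");
        categories.add("history");

        DefaultStringifier<Set<String>> stringifier =
            new DefaultStringifier<Set<String>>(conf, GenericsUtil.getClass(categories));

        // Driver side: encode the set and store it in the conf.
        conf.set("wiki.categories.sketch", stringifier.toString(categories));

        // Mapper side (e.g. in configure(JobConf)): decode it back.
        Set<String> decoded = stringifier.fromString(conf.get("wiki.categories.sketch"));
        System.out.println(decoded);
    }
}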

14 View Complete Implementation : PartialBuilder.java
Copyright Apache License 2.0
Author : sisirkoppaka
@Override
protected void configureJob(JobConf job, int nbTrees, boolean oobEstimate) throws IOException {
    FileInputFormat.setInputPaths(job, getDataPath());
    FileOutputFormat.setOutputPath(job, getOutputPath(job));
    job.setOutputKeyClass(TreeID.class);
    job.setOutputValueClass(MapredOutput.class);
    job.setMapperClass(Step1Mapper.class);
    // no reducers
    job.setNumReduceTasks(0);
    job.setInputFormat(TextInputFormat.class);
    job.setOutputFormat(SequenceFileOutputFormat.class);
    // if we are in 'local' mode, correct the number of maps
    // or the mappers won't be able to compute the right indexes
    String tracker = job.get("mapred.job.tracker", "local");
    if ("local".equals(tracker)) {
        log.warn("Hadoop running in 'local' mode, only one map task will be launched");
        job.setNumMapTasks(1);
    }
}

13 View Complete Implementation : CDMahoutEvaluator.java
Copyright Apache License 2.0
Author : ogrisel
/**
 * Configure the job
 *
 * @param conf Job to configure
 * @param rules classification rules to evaluate
 * @param target label value to evaluate the rules for
 * @param inpath input path (the dataset)
 * @param outpath output <code>Path</code>
 * @param split DatasetSplit used to separate training and testing input
 */
private static void configureJob(JobConf conf, List<? extends Rule> rules, int target, Path inpath, Path outpath, DatasetSplit split) {
    split.storeJobParameters(conf);
    FileInputFormat.setInputPaths(conf, inpath);
    FileOutputFormat.setOutputPath(conf, outpath);
    conf.setOutputKeyClass(LongWritable.class);
    conf.setOutputValueClass(CDFitness.class);
    conf.setMapperClass(CDMapper.class);
    conf.setCombinerClass(CDReducer.class);
    conf.setReducerClass(CDReducer.class);
    conf.setInputFormat(DatasetTextInputFormat.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    // store the parameters
    conf.set(CDMapper.CLASSDISCOVERY_RULES, StringUtils.toString(rules));
    conf.set(CDMapper.CLASSDISCOVERY_DATASET, StringUtils.toString(DataSet.getDataSet()));
    conf.setInt(CDMapper.CLASSDISCOVERY_TARGET_LABEL, target);
}

13 View Complete Implementation : CDMahoutEvaluator.java
Copyright Apache License 2.0
Author : sisirkoppaka
/**
 * Configure the job
 *
 * @param conf Job to configure
 * @param rules classification rules to evaluate
 * @param target label value to evaluate the rules for
 * @param inpath input path (the dataset)
 * @param outpath output <code>Path</code>
 * @param split DatasetSplit used to separate training and testing input
 */
private static void configureJob(JobConf conf, List<? extends Rule> rules, int target, Path inpath, Path outpath, DatasetSplit split) {
    split.storeJobParameters(conf);
    FileInputFormat.setInputPaths(conf, inpath);
    FileOutputFormat.setOutputPath(conf, outpath);
    conf.setOutputKeyClass(LongWritable.class);
    conf.setOutputValueClass(CDFitness.class);
    conf.setMapperClass(CDMapper.class);
    conf.setCombinerClass(CDReducer.class);
    conf.setReducerClass(CDReducer.class);
    conf.setInputFormat(DatasetTextInputFormat.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    // store the parameters
    conf.set(CDMapper.CLASSDISCOVERY_RULES, StringUtils.toString(rules));
    conf.set(CDMapper.CLASSDISCOVERY_DATASET, StringUtils.toString(DataSet.getDataSet()));
    conf.setInt(CDMapper.CLASSDISCOVERY_TARGET_LABEL, target);
}

13 View Complete Implementation : DataGenerator.java
Copyright Apache License 2.0
Author : facebookarchive
/**
 * Read the file-structure file under the input directory. Create each file
 * under the specified root. The file names are relative to the root.
 */
private void genFiles() throws IOException {
    // 
    // BufferedReader in = new BufferedReader(new FileReader(new File(inDir,
    // StructureGenerator.FILE_STRUCTURE_FILE_NAME)));
    // String line;
    // while ((line = in.readLine()) != null) {
    // String[] tokens = line.split(" ");
    // if (tokens.length != 2) {
    // throw new IOException("Expect at most 2 tokens per line: "
    // + line);
    // }
    // String fileName = root + tokens[0];
    // long fileSize = (long) (BLOCK_SIZE * Double.parseDouble(tokens[1]));
    // genFile(new Path(fileName), fileSize);
    // }
    config = new Configuration(getConf());
    config.setInt("dfs.replication", 3);
    config.set("dfs.rootdir", root.toString());
    JobConf job = new JobConf(config, DataGenerator.class);
    job.setJobName("data-genarator");
    FileOutputFormat.setOutputPath(job, new Path("data-generator-result"));
    // create the input for the map-reduce job
    Path inputPath = new Path(ROOT + "load_input");
    fs.mkdirs(inputPath);
    fs.copyFromLocalFile(new Path(inDir + "/" + StructureGenerator.FILE_STRUCTURE_FILE_NAME), inputPath);
    FileInputFormat.setInputPaths(job, new Path(ROOT + "load_input"));
    job.setInputFormat(TextInputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    job.setMapperClass(CreateFiles.class);
    job.setNumMapTasks(nFiles / 10);
    job.setNumReduceTasks(0);
    JobClient.runJob(job);
}
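
Map-only jobs like this one push all the real work into the mapper, which reads its parameters back out of the JobConf in configure(). A skeletal example of such a mapper in the old mapred API follows; it is a generic sketch, not the project's CreateFiles class.

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;

public class SideEffectMapperSketch extends MapReduceBase
        implements Mapper<LongWritable, Text, Text, Text> {

    private String rootDir;

    @Override
    public void configure(JobConf job) {
        // Read back a parameter the driver stored with conf.set(...).
        rootDir = job.get("dfs.rootdir", "/tmp");
    }

    @Override
    public void map(LongWritable key, Text value,
                    OutputCollector<Text, Text> output, Reporter reporter) throws IOException {
        // Each input line describes one unit of work; do it here and
        // report progress so long-running tasks are not killed.
        reporter.progress();
        // Emit a status record; a purely side-effecting mapper could emit nothing.
        output.collect(new Text(rootDir), value);
    }
}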

13 View Complete Implementation : DFSGeneralTest.java
Copyright Apache License 2.0
Author : facebookarchive
private void updateJobConf(JobConf conf, Path inputPath, Path outputPath) {
    // set specific job config
    conf.setLong(NUMBER_OF_MAPS_KEY, nmaps);
    conf.setLong(NUMBER_OF_THREADS_KEY, nthreads);
    conf.setInt(BUFFER_SIZE_KEY, buffersize);
    conf.setLong(WRITER_DATARATE_KEY, datarate);
    conf.setLong("mapred.task.timeout", Long.MAX_VALUE);
    conf.set(OUTPUT_DIR_KEY, output);
    // set the output and input for the map reduce
    FileInputFormat.setInputPaths(conf, inputPath);
    FileOutputFormat.setOutputPath(conf, outputPath);
    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);
    conf.setNumReduceTasks(1);
    conf.setSpeculativeExecution(false);
}

13 View Complete Implementation : NNBench.java
Copyright Apache License 2.0
Author : facebookarchive
/**
 * Run the test
 *
 * @throws IOException on error
 */
public static void runTests(Configuration config) throws IOException {
    config.setLong("io.bytes.per.checksum", bytesPerChecksum);
    JobConf job = new JobConf(config, NNBench.class);
    job.setJobName("NNBench-" + operation);
    FileInputFormat.setInputPaths(job, new Path(baseDir, CONTROL_DIR_NAME));
    job.setInputFormat(SequenceFileInputFormat.class);
    // Explicitly set number of max map attempts to 1.
    job.setMaxMapAttempts(1);
    // Explicitly turn off speculative execution
    job.setSpeculativeExecution(false);
    job.setMapperClass(NNBenchMapper.class);
    job.setReducerClass(NNBenchReducer.class);
    FileOutputFormat.setOutputPath(job, new Path(baseDir, OUTPUT_DIR_NAME));
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    job.setNumReduceTasks((int) numberOfReduces);
    JobClient.runJob(job);
}
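
Once runJob() returns, the results live under the directory that was passed to FileOutputFormat.setOutputPath(), one part-NNNNN file per reduce task (plus _logs or _SUCCESS markers, depending on the Hadoop version). A small sketch of reading text output back, assuming a TextOutputFormat job:

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class ReadOutputSketch {
    // Print every line of every "part-*" file under the job's output directory.
    public static void readOutput(Path outputDir, Configuration conf) throws IOException {
        FileSystem fs = outputDir.getFileSystem(conf);
        for (FileStatus status : fs.listStatus(outputDir)) {
            if (!status.getPath().getName().startsWith("part-")) {
                continue; // skip _logs, _SUCCESS, etc.
            }
            BufferedReader reader =
                new BufferedReader(new InputStreamReader(fs.open(status.getPath()), "UTF-8"));
            try {
                String line;
                while ((line = reader.readLine()) != null) {
                    System.out.println(line);
                }
            } finally {
                reader.close();
            }
        }
    }
}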