Here are examples of the Java API org.apache.hadoop.mapred.FileOutputFormat.setOutputPath(), taken from open source projects. By voting up you can indicate which examples are most useful and appropriate.
155 Examples
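Before the individual project examples, here is a minimal self-contained sketch of the typical call pattern. It is not taken from any of the projects below; the input and output paths are illustrative, and IdentityMapper/IdentityReducer are just convenient stand-ins from org.apache.hadoop.mapred.lib.

import java.io.IOException;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.mapred.lib.IdentityMapper;
import org.apache.hadoop.mapred.lib.IdentityReducer;

public class SetOutputPathSketch {
public static void main(String[] args) throws IOException {
JobConf conf = new JobConf(SetOutputPathSketch.class);
conf.setJobName("FileOutputFormat.setOutputPath sketch");
// Illustrative paths; in a real job these usually come from the command line.
FileInputFormat.setInputPaths(conf, new Path("/tmp/example/input"));
// setOutputPath records the directory the job's output files will be written to.
// The directory must not already exist, or job submission fails.
FileOutputFormat.setOutputPath(conf, new Path("/tmp/example/output"));
conf.setInputFormat(TextInputFormat.class);
conf.setOutputFormat(TextOutputFormat.class);
conf.setOutputKeyClass(LongWritable.class);
conf.setOutputValueClass(Text.class);
conf.setMapperClass(IdentityMapper.class);
conf.setReducerClass(IdentityReducer.class);
JobClient.runJob(conf);
}
}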
17
View Complete Implementation : TFIDFConverter.java
Copyright Apache License 2.0
Author : sisirkoppaka
/**
 * Count the document frequencies of features in parallel using Map/Reduce. The input documents have to be
 * in {@link SequenceFile} format
 */
private static void startDFCounting(Path input, Path output) throws IOException {
Configurable client = new JobClient();
JobConf conf = new JobConf(TFIDFConverter.class);
conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization," + "org.apache.hadoop.io.serializer.WritableSerialization");
// this conf parameter needs to be set to enable serialisation of conf values
conf.setJobName("VectorTfIdf Document Frequency Count running over input: " + input.toString());
conf.setOutputKeyClass(IntWritable.class);
conf.setOutputValueClass(LongWritable.class);
FileInputFormat.setInputPaths(conf, input);
FileOutputFormat.setOutputPath(conf, output);
conf.setMapperClass(TermDocumentCountMapper.class);
conf.setInputFormat(SequenceFileInputFormat.class);
conf.setCombinerClass(TermDocumentCountReducer.class);
conf.setReducerClass(TermDocumentCountReducer.class);
conf.setOutputFormat(SequenceFileOutputFormat.class);
HadoopUtil.overwriteOutput(output);
client.setConf(conf);
JobClient.runJob(conf);
}
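A pattern worth noting in this and many of the examples that follow: the old mapred API refuses to run a job whose output directory already exists, so callers clear the path handed to setOutputPath before submitting. The Mahout examples do this via HadoopUtil.overwriteOutput, and the Hadoop test code below does it with fs.delete. A minimal sketch of the same idea using only the plain FileSystem API (the helper class and method names here are illustrative, not from any of the listed projects):

import java.io.IOException;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobConf;

public final class OutputPathUtil {
private OutputPathUtil() {}

// Deletes the output directory if present, then registers it with the JobConf.
public static void setFreshOutputPath(JobConf conf, Path output) throws IOException {
FileSystem fs = output.getFileSystem(conf);
if (fs.exists(output)) {
fs.delete(output, true); // recursive delete, mirroring what HadoopUtil.overwriteOutput does
}
FileOutputFormat.setOutputPath(conf, output);
}
}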
17
View Complete Implementation : TFIDFConverter.java
Copyright Apache License 2.0
Author : ogrisel
/**
 * Count the document frequencies of features in parallel using Map/Reduce. The input documents have to be
 * in {@link SequenceFile} format
 */
private static void startDFCounting(Path input, Path output) throws IOException {
Configurable client = new JobClient();
JobConf conf = new JobConf(TFIDFConverter.class);
conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization," + "org.apache.hadoop.io.serializer.WritableSerialization");
// this conf parameter needs to be set to enable serialisation of conf values
conf.setJobName("VectorTfIdf Document Frequency Count running over input: " + input.toString());
conf.setOutputKeyClass(IntWritable.class);
conf.setOutputValueClass(LongWritable.class);
FileInputFormat.setInputPaths(conf, input);
FileOutputFormat.setOutputPath(conf, output);
conf.setMapperClass(TermDocumentCountMapper.class);
conf.setInputFormat(SequenceFileInputFormat.class);
conf.setCombinerClass(TermDocumentCountReducer.class);
conf.setReducerClass(TermDocumentCountReducer.class);
conf.setOutputFormat(SequenceFileOutputFormat.class);
HadoopUtil.overwriteOutput(output);
client.setConf(conf);
JobClient.runJob(conf);
}
16
View Complete Implementation : InMemBuilder.java
Copyright Apache License 2.0
Author : ogrisel
@Override
protected void configureJob(JobConf conf, int nbTrees, boolean oobEstimate) throws IOException {
FileOutputFormat.setOutputPath(conf, getOutputPath(conf));
// put the data in the DistributedCache
DistributedCache.addCacheFile(getDataPath().toUri(), conf);
conf.setOutputKeyClass(IntWritable.class);
conf.setOutputValueClass(MapredOutput.class);
conf.setMapperClass(InMemMapper.class);
// no reducers
conf.setNumReduceTasks(0);
conf.setInputFormat(InMemInputFormat.class);
conf.setOutputFormat(SequenceFileOutputFormat.class);
}
16
View Complete Implementation : InMemBuilder.java
Copyright Apache License 2.0
Author : sisirkoppaka
@Override
protected void configureJob(JobConf conf, int nbTrees, boolean oobEstimate) throws IOException {
FileOutputFormat.setOutputPath(conf, getOutputPath(conf));
// put the data in the DistributedCache
DistributedCache.addCacheFile(getDataPath().toUri(), conf);
conf.setOutputKeyClass(IntWritable.class);
conf.setOutputValueClass(MapredOutput.class);
conf.setMapperClass(InMemMapper.class);
// no reducers
conf.setNumReduceTasks(0);
conf.setInputFormat(InMemInputFormat.class);
conf.setOutputFormat(SequenceFileOutputFormat.class);
}
16
View Complete Implementation : TFIDFConverter.java
Copyright Apache License 2.0
Author : ogrisel
/**
 * Create a partial tfidf vector using a chunk of features from the input vectors. The input vectors have to
 * be in the {@link SequenceFile} format
 *
 * @param input
 * input directory of the vectors in {@link SequenceFile} format
 * @param featureCount
 * Number of unique features in the dataset
 * @param vectorCount
 * Number of vectors in the dataset
 * @param minDf
 * The minimum document frequency. Default 1
 * @param maxDFPercent
 * The max percentage of vectors for the DF. Can be used to remove really high frequency features.
 * Expressed as an integer between 0 and 100. Default 99
 * @param dictionaryFilePath
 * location of the chunk of features and their ids
 * @param output
 * output directory where the partial vectors have to be created
 * @throws IOException
 */
private static void makePartialVectors(Path input, Long featureCount, Long vectorCount, int minDf, int maxDFPercent, Path dictionaryFilePath, Path output, boolean sequentialAccess) throws IOException {
Configurable client = new JobClient();
JobConf conf = new JobConf(TFIDFConverter.class);
conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization," + "org.apache.hadoop.io.serializer.WritableSerialization");
// this conf parameter needs to be set to enable serialisation of conf values
conf.setJobName(": MakePartialVectors: input-folder: " + input + ", dictionary-file: " + dictionaryFilePath.toString());
conf.setLong(FEATURE_COUNT, featureCount);
conf.setLong(VECTOR_COUNT, vectorCount);
conf.setInt(MIN_DF, minDf);
conf.setInt(MAX_DF_PERCENTAGE, maxDFPercent);
conf.setBoolean(PartialVectorMerger.SEQUENTIAL_ACCESS, sequentialAccess);
conf.setOutputKeyClass(Text.class);
conf.setOutputValueClass(VectorWritable.class);
DistributedCache.setCacheFiles(new URI[] { dictionaryFilePath.toUri() }, conf);
FileInputFormat.setInputPaths(conf, input);
FileOutputFormat.setOutputPath(conf, output);
conf.setMapperClass(IdentityMapper.class);
conf.setInputFormat(SequenceFileInputFormat.class);
conf.setReducerClass(TFIDFPartialVectorReducer.class);
conf.setOutputFormat(SequenceFileOutputFormat.class);
HadoopUtil.overwriteOutput(output);
client.setConf(conf);
JobClient.runJob(conf);
}
16
View Complete Implementation : InputDriver.java
Copyright Apache License 2.0
Author : ogrisel
public static void runJob(Path input, Path output) throws IOException {
JobClient client = new JobClient();
JobConf conf = new JobConf(org.apache.mahout.clustering.syntheticcontrol.meanshift.InputDriver.class);
conf.setOutputKeyClass(Text.class);
conf.setOutputValueClass(MeanShiftCanopy.class);
FileInputFormat.setInputPaths(conf, input);
FileOutputFormat.setOutputPath(conf, output);
conf.setOutputFormat(SequenceFileOutputFormat.class);
conf.setMapperClass(org.apache.mahout.clustering.syntheticcontrol.meanshift.InputMapper.class);
conf.setReducerClass(Reducer.class);
conf.setNumReduceTasks(0);
client.setConf(conf);
JobClient.runJob(conf);
}
16
View Complete Implementation : CDbwDriver.java
Copyright Apache License 2.0
Author : ogrisel
/**
 * Run the job using supplied arguments
 *
 * @param input
 * the directory pathname for input points
 * @param stateIn
 * the directory pathname for input state
 * @param stateOut
 * the directory pathname for output state
 * @param distanceMeasureClass
 * the class name of the DistanceMeasure class
 * @param numReducers
 * the number of Reducers desired
 */
public static void runIteration(Path input, Path stateIn, Path stateOut, String distanceMeasureClass, int numReducers) {
Configurable client = new JobClient();
JobConf conf = new JobConf(CDbwDriver.class);
conf.setOutputKeyClass(IntWritable.class);
conf.setOutputValueClass(VectorWritable.class);
conf.setMapOutputKeyClass(IntWritable.class);
conf.setMapOutputValueClass(WeightedVectorWritable.class);
FileInputFormat.setInputPaths(conf, input);
FileOutputFormat.setOutputPath(conf, stateOut);
conf.setMapperClass(CDbwMapper.class);
conf.setReducerClass(CDbwReducer.class);
conf.setNumReduceTasks(numReducers);
conf.setInputFormat(SequenceFileInputFormat.class);
conf.setOutputFormat(SequenceFileOutputFormat.class);
conf.set(STATE_IN_KEY, stateIn.toString());
conf.set(DISTANCE_MEASURE_KEY, distanceMeasureClass);
client.setConf(conf);
try {
JobClient.runJob(conf);
} catch (IOException e) {
log.warn(e.toString(), e);
}
}
16
View Complete Implementation : MeanShiftCanopyDriver.java
Copyright Apache License 2.0
Author : ogrisel
/**
 * Run an iteration
 *
 * @param input
 * the input pathname String
 * @param output
 * the output pathname String
 * @param control
 * the control path
 * @param measureClassName
 * the DistanceMeasure class name
 * @param t1
 * the T1 distance threshold
 * @param t2
 * the T2 distance threshold
 * @param convergenceDelta
 * the double convergence criteria
 */
static void runIteration(Path input, Path output, Path control, String measureClassName, double t1, double t2, double convergenceDelta) {
Configurable client = new JobClient();
JobConf conf = new JobConf(MeanShiftCanopyDriver.class);
conf.setOutputKeyClass(Text.class);
conf.setOutputValueClass(MeanShiftCanopy.class);
FileInputFormat.setInputPaths(conf, input);
FileOutputFormat.setOutputPath(conf, output);
conf.setMapperClass(MeanShiftCanopyMapper.class);
conf.setReducerClass(MeanShiftCanopyReducer.class);
conf.setNumReduceTasks(1);
conf.setInputFormat(SequenceFileInputFormat.class);
conf.setOutputFormat(SequenceFileOutputFormat.class);
conf.set(MeanShiftCanopyConfigKeys.DISTANCE_MEASURE_KEY, measureClassName);
conf.set(MeanShiftCanopyConfigKeys.CLUSTER_CONVERGENCE_KEY, String.valueOf(convergenceDelta));
conf.set(MeanShiftCanopyConfigKeys.T1_KEY, String.valueOf(t1));
conf.set(MeanShiftCanopyConfigKeys.T2_KEY, String.valueOf(t2));
conf.set(MeanShiftCanopyConfigKeys.CONTROL_PATH_KEY, control.toString());
client.setConf(conf);
try {
JobClient.runJob(conf);
} catch (IOException e) {
log.warn(e.toString(), e);
}
}
16
View Complete Implementation : MeanShiftCanopyDriver.java
Copyright Apache License 2.0
Author : ogrisel
/**
 * Run the job using supplied arguments
 *
 * @param input
 * the directory pathname for input points
 * @param clustersIn
 * the directory pathname for input clusters
 * @param output
 * the directory pathname for output clustered points
 */
static void runClustering(Path input, Path clustersIn, Path output) {
JobConf conf = new JobConf(FuzzyKMeansDriver.class);
conf.setJobName("Mean Shift Clustering");
conf.setOutputKeyClass(IntWritable.class);
conf.setOutputValueClass(WeightedVectorWritable.class);
FileInputFormat.setInputPaths(conf, input);
FileOutputFormat.setOutputPath(conf, output);
conf.setMapperClass(MeanShiftCanopyClusterMapper.class);
conf.setInputFormat(SequenceFileInputFormat.class);
conf.setOutputFormat(SequenceFileOutputFormat.class);
// uncomment it to run locally
// conf.set("mapred.job.tracker", "local");
conf.setNumReduceTasks(0);
conf.set(STATE_IN_KEY, clustersIn.toString());
try {
JobClient.runJob(conf);
} catch (IOException e) {
log.warn(e.toString(), e);
}
}
16
View Complete Implementation : TestAppendStress.java
Copyright Apache License 2.0
Author : facebookarchive
private void startAppendJob(Configuration conf) throws IOException {
JobConf job = new JobConf(conf, TestAppendStress.class);
job.set(JOB_START_TIME_LABEL, new Date().toString());
FileInputFormat.setInputPaths(job, CONTROL_DIR);
FileOutputFormat.setOutputPath(job, APPEND_DIR);
job.setInputFormat(SequenceFileInputFormat.class);
job.setMapperClass(AppendMapper.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
job.setNumReduceTasks(0);
JobClient.runJob(job);
}
16
View Complete Implementation : CDbwDriver.java
Copyright Apache License 2.0
Author : sisirkoppaka
/**
 * Run the job using supplied arguments
 *
 * @param input
 * the directory pathname for input points
 * @param stateIn
 * the directory pathname for input state
 * @param stateOut
 * the directory pathname for output state
 * @param distanceMeasureClass
 * the class name of the DistanceMeasure class
 * @param numReducers
 * the number of Reducers desired
 */
public static void runIteration(Path input, Path stateIn, Path stateOut, String distanceMeasureClass, int numReducers) {
Configurable client = new JobClient();
JobConf conf = new JobConf(CDbwDriver.class);
conf.setOutputKeyClass(IntWritable.class);
conf.setOutputValueClass(VectorWritable.class);
conf.setMapOutputKeyClass(IntWritable.class);
conf.setMapOutputValueClass(WeightedVectorWritable.class);
FileInputFormat.setInputPaths(conf, input);
FileOutputFormat.setOutputPath(conf, stateOut);
conf.setMapperClass(CDbwMapper.class);
conf.setReducerClass(CDbwReducer.class);
conf.setNumReduceTasks(numReducers);
conf.setInputFormat(SequenceFileInputFormat.class);
conf.setOutputFormat(SequenceFileOutputFormat.class);
conf.set(STATE_IN_KEY, stateIn.toString());
conf.set(DISTANCE_MEASURE_KEY, distanceMeasureClass);
client.setConf(conf);
try {
JobClient.runJob(conf);
} catch (IOException e) {
log.warn(e.toString(), e);
}
}
16
View Complete Implementation : DictionaryVectorizer.java
Copyright Apache License 2.0
Author : ogrisel
/**
 * Count the frequencies of words in parallel using Map/Reduce. The input documents have to be in
 * {@link SequenceFile} format
 */
private static void startWordCounting(Path input, Path output, int minSupport) throws IOException {
Configurable client = new JobClient();
JobConf conf = new JobConf(DictionaryVectorizer.class);
conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization," + "org.apache.hadoop.io.serializer.WritableSerialization");
// this conf parameter needs to be set to enable serialisation of conf values
conf.setJobName("DictionaryVectorizer::WordCount: input-folder: " + input.toString());
conf.setInt(MIN_SUPPORT, minSupport);
conf.setOutputKeyClass(Text.class);
conf.setOutputValueClass(LongWritable.class);
FileInputFormat.setInputPaths(conf, input);
FileOutputFormat.setOutputPath(conf, output);
conf.setMapperClass(TermCountMapper.class);
conf.setInputFormat(SequenceFileInputFormat.class);
conf.setCombinerClass(TermCountReducer.class);
conf.setReducerClass(TermCountReducer.class);
conf.setOutputFormat(SequenceFileOutputFormat.class);
HadoopUtil.overwriteOutput(output);
client.setConf(conf);
JobClient.runJob(conf);
}
16
View Complete Implementation : InputDriver.java
Copyright Apache License 2.0
Author : ogrisel
public static void runJob(Path input, Path output, String vectorClassName) throws IOException {
JobClient client = new JobClient();
JobConf conf = new JobConf(InputDriver.class);
conf.setOutputKeyClass(Text.class);
conf.setOutputValueClass(VectorWritable.class);
conf.setOutputFormat(SequenceFileOutputFormat.class);
conf.set("vector.implementation.class.name", vectorClassName);
FileInputFormat.setInputPaths(conf, input);
FileOutputFormat.setOutputPath(conf, output);
conf.setMapperClass(InputMapper.class);
conf.setReducerClass(Reducer.class);
conf.setNumReduceTasks(0);
client.setConf(conf);
JobClient.runJob(conf);
}
16
View Complete Implementation : InputDriver.java
Copyright Apache License 2.0
Author : sisirkoppaka
public static void runJob(Path input, Path output) throws IOException {
JobClient client = new JobClient();
JobConf conf = new JobConf(org.apache.mahout.clustering.syntheticcontrol.meanshift.InputDriver.class);
conf.setOutputKeyClass(Text.class);
conf.setOutputValueClass(MeanShiftCanopy.class);
FileInputFormat.setInputPaths(conf, input);
FileOutputFormat.setOutputPath(conf, output);
conf.setOutputFormat(SequenceFileOutputFormat.class);
conf.setMapperClass(org.apache.mahout.clustering.syntheticcontrol.meanshift.InputMapper.class);
conf.setReducerClass(Reducer.class);
conf.setNumReduceTasks(0);
client.setConf(conf);
JobClient.runJob(conf);
}
16
View Complete Implementation : CollocDriver.java
Copyright Apache License 2.0
Author : sisirkoppaka
/**
 * pass2: perform the LLR calculation
 */
public static void computeNGramsPruneByLLR(long nGramTotal, Path output, boolean emitUnigrams, float minLLRValue, int reduceTasks) throws IOException {
JobConf conf = new JobConf(CollocDriver.class);
conf.setJobName(CollocDriver.class.getSimpleName() + ".computeNGrams: " + output);
conf.setLong(LLRReducer.NGRAM_TOTAL, nGramTotal);
conf.setBoolean(EMIT_UNIGRAMS, emitUnigrams);
conf.setMapOutputKeyClass(Gram.class);
conf.setMapOutputValueClass(Gram.class);
conf.setOutputKeyClass(Text.class);
conf.setOutputValueClass(DoubleWritable.class);
FileInputFormat.setInputPaths(conf, new Path(output, SUBGRAM_OUTPUT_DIRECTORY));
Path outPath = new Path(output, NGRAM_OUTPUT_DIRECTORY);
FileOutputFormat.setOutputPath(conf, outPath);
conf.setMapperClass(IdentityMapper.class);
conf.setInputFormat(SequenceFileInputFormat.class);
conf.setOutputFormat(SequenceFileOutputFormat.class);
conf.setReducerClass(LLRReducer.class);
conf.setNumReduceTasks(reduceTasks);
conf.setFloat(LLRReducer.MIN_LLR, minLLRValue);
JobClient.runJob(conf);
}
16
View Complete Implementation : DictionaryVectorizer.java
Copyright Apache License 2.0
Author : sisirkoppaka
/**
 * Count the frequencies of words in parallel using Map/Reduce. The input documents have to be in
 * {@link SequenceFile} format
 */
private static void startWordCounting(Path input, Path output, int minSupport) throws IOException {
Configurable client = new JobClient();
JobConf conf = new JobConf(DictionaryVectorizer.class);
conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization," + "org.apache.hadoop.io.serializer.WritableSerialization");
// this conf parameter needs to be set to enable serialisation of conf values
conf.setJobName("DictionaryVectorizer::WordCount: input-folder: " + input.toString());
conf.setInt(MIN_SUPPORT, minSupport);
conf.setOutputKeyClass(Text.class);
conf.setOutputValueClass(LongWritable.class);
FileInputFormat.setInputPaths(conf, input);
FileOutputFormat.setOutputPath(conf, output);
conf.setMapperClass(TermCountMapper.class);
conf.setInputFormat(SequenceFileInputFormat.class);
conf.setCombinerClass(TermCountReducer.class);
conf.setReducerClass(TermCountReducer.class);
conf.setOutputFormat(SequenceFileOutputFormat.class);
HadoopUtil.overwriteOutput(output);
client.setConf(conf);
JobClient.runJob(conf);
}
16
View Complete Implementation : MeanShiftCanopyDriver.java
Copyright Apache License 2.0
Author : sisirkoppaka
/**
 * Run an iteration
 *
 * @param input
 * the input pathname String
 * @param output
 * the output pathname String
 * @param control
 * the control path
 * @param measureClassName
 * the DistanceMeasure class name
 * @param t1
 * the T1 distance threshold
 * @param t2
 * the T2 distance threshold
 * @param convergenceDelta
 * the double convergence criteria
 */
static void runIteration(Path input, Path output, Path control, String measureClassName, double t1, double t2, double convergenceDelta) {
Configurable client = new JobClient();
JobConf conf = new JobConf(MeanShiftCanopyDriver.class);
conf.setOutputKeyClass(Text.class);
conf.setOutputValueClass(MeanShiftCanopy.class);
FileInputFormat.setInputPaths(conf, input);
FileOutputFormat.setOutputPath(conf, output);
conf.setMapperClass(MeanShiftCanopyMapper.class);
conf.setReducerClass(MeanShiftCanopyReducer.class);
conf.setNumReduceTasks(1);
conf.setInputFormat(SequenceFileInputFormat.class);
conf.setOutputFormat(SequenceFileOutputFormat.class);
conf.set(MeanShiftCanopyConfigKeys.DISTANCE_MEASURE_KEY, measureClassName);
conf.set(MeanShiftCanopyConfigKeys.CLUSTER_CONVERGENCE_KEY, String.valueOf(convergenceDelta));
conf.set(MeanShiftCanopyConfigKeys.T1_KEY, String.valueOf(t1));
conf.set(MeanShiftCanopyConfigKeys.T2_KEY, String.valueOf(t2));
conf.set(MeanShiftCanopyConfigKeys.CONTROL_PATH_KEY, control.toString());
client.setConf(conf);
try {
JobClient.runJob(conf);
} catch (IOException e) {
log.warn(e.toString(), e);
}
}
16
View Complete Implementation : TFIDFConverter.java
Copyright Apache License 2.0
Author : sisirkoppaka
/**
 * Create a partial tfidf vector using a chunk of features from the input vectors. The input vectors have to
 * be in the {@link SequenceFile} format
 *
 * @param input
 * input directory of the vectors in {@link SequenceFile} format
 * @param featureCount
 * Number of unique features in the dataset
 * @param vectorCount
 * Number of vectors in the dataset
 * @param minDf
 * The minimum document frequency. Default 1
 * @param maxDFPercent
 * The max percentage of vectors for the DF. Can be used to remove really high frequency features.
 * Expressed as an integer between 0 and 100. Default 99
 * @param dictionaryFilePath
 * location of the chunk of features and their ids
 * @param output
 * output directory where the partial vectors have to be created
 * @throws IOException
 */
private static void makePartialVectors(Path input, Long featureCount, Long vectorCount, int minDf, int maxDFPercent, Path dictionaryFilePath, Path output, boolean sequentialAccess) throws IOException {
Configurable client = new JobClient();
JobConf conf = new JobConf(TFIDFConverter.class);
conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization," + "org.apache.hadoop.io.serializer.WritableSerialization");
// this conf parameter needs to be set to enable serialisation of conf values
conf.setJobName(": MakePartialVectors: input-folder: " + input + ", dictionary-file: " + dictionaryFilePath.toString());
conf.setLong(FEATURE_COUNT, featureCount);
conf.setLong(VECTOR_COUNT, vectorCount);
conf.setInt(MIN_DF, minDf);
conf.setInt(MAX_DF_PERCENTAGE, maxDFPercent);
conf.setBoolean(PartialVectorMerger.SEQUENTIAL_ACCESS, sequentialAccess);
conf.setOutputKeyClass(Text.class);
conf.setOutputValueClass(VectorWritable.class);
DistributedCache.setCacheFiles(new URI[] { dictionaryFilePath.toUri() }, conf);
FileInputFormat.setInputPaths(conf, input);
FileOutputFormat.setOutputPath(conf, output);
conf.setMapperClass(IdentityMapper.class);
conf.setInputFormat(SequenceFileInputFormat.class);
conf.setReducerClass(TFIDFPartialVectorReducer.class);
conf.setOutputFormat(SequenceFileOutputFormat.class);
HadoopUtil.overwriteOutput(output);
client.setConf(conf);
JobClient.runJob(conf);
}
16
View Complete Implementation : MeanShiftCanopyDriver.java
Copyright Apache License 2.0
Author : sisirkoppaka
static void createCanopyFromVectors(Path input, Path output) {
Configurable client = new JobClient();
JobConf conf = new JobConf(MeanShiftCanopyDriver.class);
conf.setOutputKeyClass(Text.class);
conf.setOutputValueClass(MeanShiftCanopy.class);
FileInputFormat.setInputPaths(conf, input);
FileOutputFormat.setOutputPath(conf, output);
conf.setMapperClass(MeanShiftCanopyCreatorMapper.class);
conf.setNumReduceTasks(0);
conf.setInputFormat(SequenceFileInputFormat.class);
conf.setOutputFormat(SequenceFileOutputFormat.class);
client.setConf(conf);
try {
JobClient.runJob(conf);
} catch (IOException e) {
log.warn(e.toString(), e);
}
}
16
View Complete Implementation : MeanShiftCanopyDriver.java
Copyright Apache License 2.0
Author : sisirkoppaka
/**
 * Run the job using supplied arguments
 *
 * @param input
 * the directory pathname for input points
 * @param clustersIn
 * the directory pathname for input clusters
 * @param output
 * the directory pathname for output clustered points
 */
static void runClustering(Path input, Path clustersIn, Path output) {
JobConf conf = new JobConf(FuzzyKMeansDriver.class);
conf.setJobName("Mean Shift Clustering");
conf.setOutputKeyClass(IntWritable.class);
conf.setOutputValueClass(WeightedVectorWritable.class);
FileInputFormat.setInputPaths(conf, input);
FileOutputFormat.setOutputPath(conf, output);
conf.setMapperClass(MeanShiftCanopyClusterMapper.class);
conf.setInputFormat(SequenceFileInputFormat.class);
conf.setOutputFormat(SequenceFileOutputFormat.class);
// uncomment it to run locally
// conf.set("mapred.job.tracker", "local");
conf.setNumReduceTasks(0);
conf.set(STATE_IN_KEY, clustersIn.toString());
try {
JobClient.runJob(conf);
} catch (IOException e) {
log.warn(e.toString(), e);
}
}
16
View Complete Implementation : MeanShiftCanopyDriver.java
Copyright Apache License 2.0
Author : ogrisel
static void createCanopyFromVectors(Path input, Path output) {
Configurable client = new JobClient();
JobConf conf = new JobConf(MeanShiftCanopyDriver.class);
conf.setOutputKeyClass(Text.class);
conf.setOutputValueClass(MeanShiftCanopy.class);
FileInputFormat.setInputPaths(conf, input);
FileOutputFormat.setOutputPath(conf, output);
conf.setMapperClass(MeanShiftCanopyCreatorMapper.class);
conf.setNumReduceTasks(0);
conf.setInputFormat(SequenceFileInputFormat.class);
conf.setOutputFormat(SequenceFileOutputFormat.class);
client.setConf(conf);
try {
JobClient.runJob(conf);
} catch (IOException e) {
log.warn(e.toString(), e);
}
}
16
View Complete Implementation : InputDriver.java
Copyright Apache License 2.0
Author : sisirkoppaka
public static void runJob(Path input, Path output, String vectorClassName) throws IOException {
JobClient client = new JobClient();
JobConf conf = new JobConf(InputDriver.class);
conf.setOutputKeyClass(Text.class);
conf.setOutputValueClass(VectorWritable.class);
conf.setOutputFormat(SequenceFileOutputFormat.class);
conf.set("vector.implementation.class.name", vectorClassName);
FileInputFormat.setInputPaths(conf, input);
FileOutputFormat.setOutputPath(conf, output);
conf.setMapperClass(InputMapper.class);
conf.setReducerClass(Reducer.class);
conf.setNumReduceTasks(0);
client.setConf(conf);
JobClient.runJob(conf);
}
16
View Complete Implementation : CollocDriver.java
Copyright Apache License 2.0
Author : ogrisel
/**
 * pass2: perform the LLR calculation
 */
public static void computeNGramsPruneByLLR(Path output, Configuration baseConf, long nGramTotal, boolean emitUnigrams, float minLLRValue, int reduceTasks) throws IOException {
JobConf conf = new JobConf(baseConf, CollocDriver.class);
conf.setJobName(CollocDriver.class.getSimpleName() + ".computeNGrams: " + output);
conf.setLong(LLRReducer.NGRAM_TOTAL, nGramTotal);
conf.setBoolean(EMIT_UNIGRAMS, emitUnigrams);
conf.setMapOutputKeyClass(Gram.class);
conf.setMapOutputValueClass(Gram.class);
conf.setOutputKeyClass(Text.class);
conf.setOutputValueClass(DoubleWritable.class);
FileInputFormat.setInputPaths(conf, new Path(output, SUBGRAM_OUTPUT_DIRECTORY));
Path outPath = new Path(output, NGRAM_OUTPUT_DIRECTORY);
FileOutputFormat.setOutputPath(conf, outPath);
conf.setMapperClass(IdentityMapper.class);
conf.setInputFormat(SequenceFileInputFormat.class);
conf.setOutputFormat(SequenceFileOutputFormat.class);
conf.setReducerClass(LLRReducer.class);
conf.setNumReduceTasks(reduceTasks);
conf.setFloat(LLRReducer.MIN_LLR, minLLRValue);
JobClient.runJob(conf);
}
15
View Complete Implementation : OutputDriver.java
Copyright Apache License 2.0
Author : sisirkoppaka
public static void runJob(String input, String output) throws IOException {
JobClient client = new JobClient();
JobConf conf = new JobConf(org.apache.mahout.clustering.syntheticcontrol.meanshift.OutputDriver.class);
conf.setOutputKeyClass(Text.class);
conf.setOutputValueClass(Text.class);
conf.setInputFormat(SequenceFileInputFormat.class);
FileInputFormat.setInputPaths(conf, new Path(input));
FileOutputFormat.setOutputPath(conf, new Path(output));
conf.setMapperClass(OutputMapper.class);
conf.setReducerClass(Reducer.class);
conf.setNumReduceTasks(0);
client.setConf(conf);
JobClient.runJob(conf);
}
15
View Complete Implementation : DictionaryVectorizer.java
Copyright Apache License 2.0
Author : sisirkoppaka
/**
 * Create a partial vector using a chunk of features from the input documents. The input documents have to be
 * in the {@link SequenceFile} format
 *
 * @param input
 * input directory of the documents in {@link SequenceFile} format
 * @param maxNGramSize
 * maximum size of ngrams to generate
 * @param dictionaryFilePath
 * location of the chunk of features and their ids
 * @param output
 * output directory where the partial vectors have to be created
 * @param numReducers
 * the desired number of reducer tasks
 * @throws IOException
 */
private static void makePartialVectors(Path input, int maxNGramSize, Path dictionaryFilePath, Path output, int dimension, boolean sequentialAccess, int numReducers) throws IOException {
Configurable client = new JobClient();
JobConf conf = new JobConf(DictionaryVectorizer.class);
conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization," + "org.apache.hadoop.io.serializer.WritableSerialization");
// this conf parameter needs to be set to enable serialisation of conf values
conf.setJobName("DictionaryVectorizer::MakePartialVectors: input-folder: " + input + ", dictionary-file: " + dictionaryFilePath.toString());
conf.setInt(PartialVectorMerger.DIMENSION, dimension);
conf.setBoolean(PartialVectorMerger.SEQUENTIAL_ACCESS, sequentialAccess);
conf.setInt(MAX_NGRAMS, maxNGramSize);
conf.setMapOutputKeyClass(Text.class);
conf.setMapOutputValueClass(StringTuple.class);
conf.setOutputKeyClass(Text.class);
conf.setOutputValueClass(VectorWritable.class);
DistributedCache.setCacheFiles(new URI[] { dictionaryFilePath.toUri() }, conf);
FileInputFormat.setInputPaths(conf, input);
FileOutputFormat.setOutputPath(conf, output);
conf.setMapperClass(IdentityMapper.class);
conf.setInputFormat(SequenceFileInputFormat.class);
conf.setReducerClass(TFPartialVectorReducer.class);
conf.setOutputFormat(SequenceFileOutputFormat.class);
conf.setNumReduceTasks(numReducers);
HadoopUtil.overwriteOutput(output);
client.setConf(conf);
JobClient.runJob(conf);
}
15
View Complete Implementation : DictionaryVectorizer.java
Copyright Apache License 2.0
Author : ogrisel
/**
 * Create a partial vector using a chunk of features from the input documents. The input documents have to be
 * in the {@link SequenceFile} format
 *
 * @param input
 * input directory of the documents in {@link SequenceFile} format
 * @param maxNGramSize
 * maximum size of ngrams to generate
 * @param dictionaryFilePath
 * location of the chunk of features and their ids
 * @param output
 * output directory where the partial vectors have to be created
 * @param numReducers
 * the desired number of reducer tasks
 * @throws IOException
 */
private static void makePartialVectors(Path input, int maxNGramSize, Path dictionaryFilePath, Path output, int dimension, boolean sequentialAccess, int numReducers) throws IOException {
Configurable client = new JobClient();
JobConf conf = new JobConf(DictionaryVectorizer.class);
conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization," + "org.apache.hadoop.io.serializer.WritableSerialization");
// this conf parameter needs to be set to enable serialisation of conf values
conf.setJobName("DictionaryVectorizer::MakePartialVectors: input-folder: " + input + ", dictionary-file: " + dictionaryFilePath.toString());
conf.setInt(PartialVectorMerger.DIMENSION, dimension);
conf.setBoolean(PartialVectorMerger.SEQUENTIAL_ACCESS, sequentialAccess);
conf.setInt(MAX_NGRAMS, maxNGramSize);
conf.setMapOutputKeyClass(Text.class);
conf.setMapOutputValueClass(StringTuple.class);
conf.setOutputKeyClass(Text.class);
conf.setOutputValueClass(VectorWritable.class);
DistributedCache.setCacheFiles(new URI[] { dictionaryFilePath.toUri() }, conf);
FileInputFormat.setInputPaths(conf, input);
FileOutputFormat.setOutputPath(conf, output);
conf.setMapperClass(IdentityMapper.class);
conf.setInputFormat(SequenceFileInputFormat.class);
conf.setReducerClass(TFPartialVectorReducer.class);
conf.setOutputFormat(SequenceFileOutputFormat.class);
conf.setNumReduceTasks(numReducers);
HadoopUtil.overwriteOutput(output);
client.setConf(conf);
JobClient.runJob(conf);
}
15
View Complete Implementation : WikipediaToSequenceFile.java
Copyright Apache License 2.0
Author : ogrisel
/**
 * Run the job
 *
 * @param input
 * the input pathname String
 * @param output
 * the output pathname String
 * @param catFile
 * the file containing the Wikipedia categories
 * @param exactMatchOnly
 * if true, then the Wikipedia category must match exactly instead of simply containing the
 * category string
 * @param all
 * if true select all categories
 */
public static void runJob(String input, String output, String catFile, boolean exactMatchOnly, boolean all) throws IOException {
JobClient client = new JobClient();
JobConf conf = new JobConf(WikipediaToSequenceFile.class);
if (WikipediaToSequenceFile.log.isInfoEnabled()) {
log.info("Input: " + input + " Out: " + output + " Categories: " + catFile + " All Files: " + all);
}
conf.set("xmlinput.start", "<page>");
conf.set("xmlinput.end", "</page>");
conf.setOutputKeyClass(Text.class);
conf.setOutputValueClass(Text.class);
conf.setBoolean("exact.match.only", exactMatchOnly);
conf.setBoolean("all.files", all);
FileInputFormat.setInputPaths(conf, new Path(input));
Path outPath = new Path(output);
FileOutputFormat.setOutputPath(conf, outPath);
conf.setMapperClass(WikipediaMapper.class);
conf.setInputFormat(XmlInputFormat.class);
conf.setReducerClass(IdentityReducer.class);
conf.setOutputFormat(SequenceFileOutputFormat.class);
conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization," + "org.apache.hadoop.io.serializer.WritableSerialization");
/*
 * conf.set("mapred.compress.map.output", "true"); conf.set("mapred.map.output.compression.type",
 * "BLOCK"); conf.set("mapred.output.compress", "true"); conf.set("mapred.output.compression.type",
 * "BLOCK"); conf.set("mapred.output.compression.codec", "org.apache.hadoop.io.compress.GzipCodec");
 */
HadoopUtil.overwriteOutput(outPath);
Set<String> categories = new HashSet<String>();
if (catFile.length() > 0) {
for (String line : new FileLineIterable(new File(catFile))) {
categories.add(line.trim().toLowerCase(Locale.ENGLISH));
}
}
DefaultStringifier<Set<String>> setStringifier = new DefaultStringifier<Set<String>>(conf, GenericsUtil.getClass(categories));
String categoriesStr = setStringifier.toString(categories);
conf.set("wikipedia.categories", categoriesStr);
client.setConf(conf);
JobClient.runJob(conf);
}
15
View Complete Implementation : WikipediaToSequenceFile.java
Copyright Apache License 2.0
Author : sisirkoppaka
/**
 * Run the job
 *
 * @param input
 * the input pathname String
 * @param output
 * the output pathname String
 * @param catFile
 * the file containing the Wikipedia categories
 * @param exactMatchOnly
 * if true, then the Wikipedia category must match exactly instead of simply containing the
 * category string
 * @param all
 * if true select all categories
 */
public static void runJob(String input, String output, String catFile, boolean exactMatchOnly, boolean all) throws IOException {
JobClient client = new JobClient();
JobConf conf = new JobConf(WikipediaToSequenceFile.class);
if (WikipediaToSequenceFile.log.isInfoEnabled()) {
log.info("Input: " + input + " Out: " + output + " Categories: " + catFile + " All Files: " + all);
}
conf.set("xmlinput.start", "<page>");
conf.set("xmlinput.end", "</page>");
conf.setOutputKeyClass(Text.class);
conf.setOutputValueClass(Text.class);
conf.setBoolean("exact.match.only", exactMatchOnly);
conf.setBoolean("all.files", all);
FileInputFormat.setInputPaths(conf, new Path(input));
Path outPath = new Path(output);
FileOutputFormat.setOutputPath(conf, outPath);
conf.setMapperClass(WikipediaMapper.class);
conf.setInputFormat(XmlInputFormat.class);
conf.setReducerClass(IdentityReducer.class);
conf.setOutputFormat(SequenceFileOutputFormat.class);
conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization," + "org.apache.hadoop.io.serializer.WritableSerialization");
/*
 * conf.set("mapred.compress.map.output", "true"); conf.set("mapred.map.output.compression.type",
 * "BLOCK"); conf.set("mapred.output.compress", "true"); conf.set("mapred.output.compression.type",
 * "BLOCK"); conf.set("mapred.output.compression.codec", "org.apache.hadoop.io.compress.GzipCodec");
 */
HadoopUtil.overwriteOutput(outPath);
Set<String> categories = new HashSet<String>();
if (catFile.length() > 0) {
for (String line : new FileLineIterable(new File(catFile))) {
categories.add(line.trim().toLowerCase());
}
}
DefaultStringifier<Set<String>> setStringifier = new DefaultStringifier<Set<String>>(conf, GenericsUtil.getClass(categories));
String categoriesStr = setStringifier.toString(categories);
conf.set("wikipedia.categories", categoriesStr);
client.setConf(conf);
JobClient.runJob(conf);
}
15
View Complete Implementation : TxnGenerator.java
Copyright Apache License 2.0
Author : facebookarchive
private static JobConf createJobConf(Configuration conf) throws IOException {
JobConf jobConf = new JobConf(conf);
String jobName = "transaction_generator";
jobConf.setJobName(jobName);
String splitDir = workplace + "split/";
jobConf.set(TEST_DIR_LABEL, workplace);
jobConf.setMapSpeculativeExecution(false);
jobConf.setJarByClass(TxnGenerator.class);
jobConf.setMapperClass(GeneratorMapper.class);
jobConf.setInputFormat(TextInputFormat.class);
FileInputFormat.addInputPath(jobConf, new Path(splitDir));
Random random = new Random();
FileOutputFormat.setOutputPath(jobConf, new Path(workplace, "output" + random.nextLong()));
jobConf.setNumReduceTasks(0);
jobConf.setNumMapTasks(numMappers);
createSplitFiles(conf, new Path(splitDir));
return jobConf;
}
15
View Complete Implementation : TestFileSystem.java
Copyright Apache License 2.0
Author : facebookarchive
public static void readTest(FileSystem fs, boolean fastCheck) throws Exception {
fs.delete(READ_DIR, true);
JobConf job = new JobConf(conf, TestFileSystem.class);
job.setBoolean("fs.test.fastCheck", fastCheck);
FileInputFormat.setInputPaths(job, CONTROL_DIR);
job.setInputFormat(SequenceFileInputFormat.class);
job.setMapperClass(ReadMapper.class);
job.setReducerClass(LongSumReducer.class);
FileOutputFormat.setOutputPath(job, READ_DIR);
job.setOutputKeyClass(UTF8.class);
job.setOutputValueClass(LongWritable.class);
job.setNumReduceTasks(1);
JobClient.runJob(job);
}
15
View Complete Implementation : TestFileSystem.java
Copyright Apache License 2.0
Author : facebookarchive
public static void writeTest(FileSystem fs, boolean fastCheck) throws Exception {
fs.delete(DATA_DIR, true);
fs.delete(WRITE_DIR, true);
JobConf job = new JobConf(conf, TestFileSystem.class);
job.setBoolean("fs.test.fastCheck", fastCheck);
FileInputFormat.setInputPaths(job, CONTROL_DIR);
job.setInputFormat(SequenceFileInputFormat.class);
job.setMapperClass(WriteMapper.class);
job.setReducerClass(LongSumReducer.class);
FileOutputFormat.setOutputPath(job, WRITE_DIR);
job.setOutputKeyClass(UTF8.class);
job.setOutputValueClass(LongWritable.class);
job.setNumReduceTasks(1);
JobClient.runJob(job);
}
15
View Complete Implementation : ExternalMapReduce.java
Copyright Apache License 2.0
Author : facebookarchive
public int run(String[] argv) throws IOException {
if (argv.length < 2) {
System.out.println("ExternalMapReduce <input> <output>");
return -1;
}
Path outDir = new Path(argv[1]);
Path input = new Path(argv[0]);
JobConf testConf = new JobConf(getConf(), ExternalMapReduce.class);
// try to load a class from libjar
try {
testConf.getClassByName("testjar.ClassWordCount");
} catch (ClassNotFoundException e) {
System.out.println("Could not find class from libjar");
return -1;
}
testConf.setJobName("external job");
FileInputFormat.setInputPaths(testConf, input);
FileOutputFormat.setOutputPath(testConf, outDir);
testConf.setMapperClass(MapClass.class);
testConf.setReducerClass(Reduce.class);
testConf.setNumReduceTasks(1);
JobClient.runJob(testConf);
return 0;
}
15
View Complete Implementation : TestFileSystem.java
Copyright Apache License 2.0
Author : facebookarchive
public static void seekTest(FileSystem fs, boolean fastCheck) throws Exception {
fs.delete(READ_DIR, true);
JobConf job = new JobConf(conf, TestFileSystem.class);
job.setBoolean("fs.test.fastCheck", fastCheck);
FileInputFormat.setInputPaths(job, CONTROL_DIR);
job.setInputFormat(SequenceFileInputFormat.class);
job.setMapperClass(SeekMapper.class);
job.setReducerClass(LongSumReducer.class);
FileOutputFormat.setOutputPath(job, READ_DIR);
job.setOutputKeyClass(UTF8.class);
job.setOutputValueClass(LongWritable.class);
job.setNumReduceTasks(1);
JobClient.runJob(job);
}
15
View Complete Implementation : OutputDriver.java
Copyright Apache License 2.0
Author : ogrisel
public static void runJob(String input, String output) throws IOException {
JobClient client = new JobClient();
JobConf conf = new JobConf(org.apache.mahout.clustering.syntheticcontrol.meanshift.OutputDriver.class);
conf.setOutputKeyClass(Text.class);
conf.setOutputValueClass(Text.class);
conf.setInputFormat(SequenceFileInputFormat.class);
FileInputFormat.setInputPaths(conf, new Path(input));
FileOutputFormat.setOutputPath(conf, new Path(output));
conf.setMapperClass(OutputMapper.class);
conf.setReducerClass(Reducer.class);
conf.setNumReduceTasks(0);
client.setConf(conf);
JobClient.runJob(conf);
}
14
View Complete Implementation : BayesThetaNormalizerDriver.java
Copyright Apache License 2.0
Author : sisirkoppaka
@Override
public void runJob(Path input, Path output, BayesParameters params) throws IOException {
Configurable client = new JobClient();
JobConf conf = new JobConf(BayesThetaNormalizerDriver.class);
conf.setJobName("Bayes Theta Normalizer Driver running over input: " + input);
conf.setOutputKeyClass(StringTuple.class);
conf.setOutputValueClass(DoubleWritable.class);
FileInputFormat.addInputPath(conf, new Path(output, "trainer-tfIdf/trainer-tfIdf"));
Path outPath = new Path(output, "trainer-thetaNormalizer");
FileOutputFormat.setOutputPath(conf, outPath);
// conf.setNumMapTasks(100);
// conf.setNumReduceTasks(1);
conf.setMapperClass(BayesThetaNormalizerMapper.class);
conf.setInputFormat(SequenceFileInputFormat.class);
conf.setCombinerClass(BayesThetaNormalizerReducer.class);
conf.setReducerClass(BayesThetaNormalizerReducer.class);
conf.setOutputFormat(SequenceFileOutputFormat.class);
conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization," + "org.apache.hadoop.io.serializer.WritableSerialization");
// Don't ever forget this. People should keep track of how Hadoop conf
// parameters can make or break a piece of code
HadoopUtil.overwriteOutput(outPath);
FileSystem dfs = FileSystem.get(outPath.toUri(), conf);
Path sigmaKFiles = new Path(output, "trainer-weights/Sigma_k/*");
Map<String, Double> labelWeightSum = SequenceFileModelReader.readLabelSums(dfs, sigmaKFiles, conf);
DefaultStringifier<Map<String, Double>> mapStringifier = new DefaultStringifier<Map<String, Double>>(conf, GenericsUtil.getClass(labelWeightSum));
String labelWeightSumString = mapStringifier.toString(labelWeightSum);
log.info("Sigma_k for Each Label");
Map<String, Double> c = mapStringifier.fromString(labelWeightSumString);
log.info("{}", c);
conf.set("cnaivebayes.sigma_k", labelWeightSumString);
Path sigmaJSigmaKFile = new Path(output, "trainer-weights/Sigma_kSigma_j/*");
double sigmaJSigmaK = SequenceFileModelReader.readSigmaJSigmaK(dfs, sigmaJSigmaKFile, conf);
DefaultStringifier<Double> stringifier = new DefaultStringifier<Double>(conf, Double.class);
String sigmaJSigmaKString = stringifier.toString(sigmaJSigmaK);
log.info("Sigma_kSigma_j for each Label and for each Features");
double retSigmaJSigmaK = stringifier.fromString(sigmaJSigmaKString);
log.info("{}", retSigmaJSigmaK);
conf.set("cnaivebayes.sigma_jSigma_k", sigmaJSigmaKString);
Path vocabCountFile = new Path(output, "trainer-tfIdf/trainer-vocabCount/*");
double vocabCount = SequenceFileModelReader.readVocabCount(dfs, vocabCountFile, conf);
String vocabCountString = stringifier.toString(vocabCount);
log.info("Vocabulary Count");
conf.set("cnaivebayes.vocabCount", vocabCountString);
double retvocabCount = stringifier.fromString(vocabCountString);
log.info("{}", retvocabCount);
conf.set("bayes.parameters", params.toString());
conf.set("output.table", output.toString());
client.setConf(conf);
JobClient.runJob(conf);
}
14
View Complete Implementation : CBayesThetaNormalizerDriver.java
Copyright Apache License 2.0
Author : ogrisel
@Override
public void runJob(Path input, Path output, BayesParameters params) throws IOException {
Configurable client = new JobClient();
JobConf conf = new JobConf(CBayesThetaNormalizerDriver.class);
conf.setJobName("Complementary Bayes Theta Normalizer Driver running over input: " + input);
conf.setOutputKeyClass(StringTuple.class);
conf.setOutputValueClass(DoubleWritable.class);
FileInputFormat.addInputPath(conf, new Path(output, "trainer-weights/Sigma_j"));
FileInputFormat.addInputPath(conf, new Path(output, "trainer-tfIdf/trainer-tfIdf"));
Path outPath = new Path(output, "trainer-thetaNormalizer");
FileOutputFormat.setOutputPath(conf, outPath);
// conf.setNumMapTasks(100);
// conf.setNumReduceTasks(1);
conf.setMapperClass(CBayesThetaNormalizerMapper.class);
conf.setInputFormat(SequenceFileInputFormat.class);
conf.setCombinerClass(CBayesThetaNormalizerReducer.class);
conf.setReducerClass(CBayesThetaNormalizerReducer.class);
conf.setOutputFormat(SequenceFileOutputFormat.class);
conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,org.apache.hadoop.io.serializer.WritableSerialization");
// Don't ever forget this. People should keep track of how Hadoop conf
// parameters can make or break a piece of code
FileSystem dfs = FileSystem.get(outPath.toUri(), conf);
HadoopUtil.overwriteOutput(outPath);
Path sigmaKFiles = new Path(output, "trainer-weights/Sigma_k/*");
Map<String, Double> labelWeightSum = SequenceFileModelReader.readLabelSums(dfs, sigmaKFiles, conf);
DefaultStringifier<Map<String, Double>> mapStringifier = new DefaultStringifier<Map<String, Double>>(conf, GenericsUtil.getClass(labelWeightSum));
String labelWeightSumString = mapStringifier.toString(labelWeightSum);
log.info("Sigma_k for Each Label");
Map<String, Double> c = mapStringifier.fromString(labelWeightSumString);
log.info("{}", c);
conf.set("cnaivebayes.sigma_k", labelWeightSumString);
Path sigmaKSigmaJFile = new Path(output, "trainer-weights/Sigma_kSigma_j/*");
double sigmaJSigmaK = SequenceFileModelReader.readSigmaJSigmaK(dfs, sigmaKSigmaJFile, conf);
DefaultStringifier<Double> stringifier = new DefaultStringifier<Double>(conf, Double.class);
String sigmaJSigmaKString = stringifier.toString(sigmaJSigmaK);
log.info("Sigma_kSigma_j for each Label and for each Features");
double retSigmaJSigmaK = stringifier.fromString(sigmaJSigmaKString);
log.info("{}", retSigmaJSigmaK);
conf.set("cnaivebayes.sigma_jSigma_k", sigmaJSigmaKString);
Path vocabCountFile = new Path(output, "trainer-tfIdf/trainer-vocabCount/*");
double vocabCount = SequenceFileModelReader.readVocabCount(dfs, vocabCountFile, conf);
String vocabCountString = stringifier.toString(vocabCount);
log.info("Vocabulary Count");
conf.set("cnaivebayes.vocabCount", vocabCountString);
double retvocabCount = stringifier.fromString(vocabCountString);
log.info("{}", retvocabCount);
conf.set("bayes.parameters", params.toString());
conf.set("output.table", output.toString());
client.setConf(conf);
JobClient.runJob(conf);
}
14
View Complete Implementation : BayesThetaNormalizerDriver.java
Copyright Apache License 2.0
Author : ogrisel
@Override
public void runJob(Path input, Path output, BayesParameters params) throws IOException {
Configurable client = new JobClient();
JobConf conf = new JobConf(BayesThetaNormalizerDriver.class);
conf.setJobName("Bayes Theta Normalizer Driver running over input: " + input);
conf.setOutputKeyClass(StringTuple.class);
conf.setOutputValueClass(DoubleWritable.class);
FileInputFormat.addInputPath(conf, new Path(output, "trainer-tfIdf/trainer-tfIdf"));
Path outPath = new Path(output, "trainer-thetaNormalizer");
FileOutputFormat.setOutputPath(conf, outPath);
// conf.setNumMapTasks(100);
// conf.setNumReduceTasks(1);
conf.setMapperClass(BayesThetaNormalizerMapper.class);
conf.setInputFormat(SequenceFileInputFormat.class);
conf.setCombinerClass(BayesThetaNormalizerReducer.class);
conf.setReducerClass(BayesThetaNormalizerReducer.class);
conf.setOutputFormat(SequenceFileOutputFormat.class);
conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization," + "org.apache.hadoop.io.serializer.WritableSerialization");
// Don't ever forget this. People should keep track of how Hadoop conf
// parameters can make or break a piece of code
HadoopUtil.overwriteOutput(outPath);
FileSystem dfs = FileSystem.get(outPath.toUri(), conf);
Path sigmaKFiles = new Path(output, "trainer-weights/Sigma_k/*");
Map<String, Double> labelWeightSum = SequenceFileModelReader.readLabelSums(dfs, sigmaKFiles, conf);
DefaultStringifier<Map<String, Double>> mapStringifier = new DefaultStringifier<Map<String, Double>>(conf, GenericsUtil.getClass(labelWeightSum));
String labelWeightSumString = mapStringifier.toString(labelWeightSum);
log.info("Sigma_k for Each Label");
Map<String, Double> c = mapStringifier.fromString(labelWeightSumString);
log.info("{}", c);
conf.set("cnaivebayes.sigma_k", labelWeightSumString);
Path sigmaJSigmaKFile = new Path(output, "trainer-weights/Sigma_kSigma_j/*");
double sigmaJSigmaK = SequenceFileModelReader.readSigmaJSigmaK(dfs, sigmaJSigmaKFile, conf);
DefaultStringifier<Double> stringifier = new DefaultStringifier<Double>(conf, Double.class);
String sigmaJSigmaKString = stringifier.toString(sigmaJSigmaK);
log.info("Sigma_kSigma_j for each Label and for each Features");
double retSigmaJSigmaK = stringifier.fromString(sigmaJSigmaKString);
log.info("{}", retSigmaJSigmaK);
conf.set("cnaivebayes.sigma_jSigma_k", sigmaJSigmaKString);
Path vocabCountFile = new Path(output, "trainer-tfIdf/trainer-vocabCount/*");
double vocabCount = SequenceFileModelReader.readVocabCount(dfs, vocabCountFile, conf);
String vocabCountString = stringifier.toString(vocabCount);
log.info("Vocabulary Count");
conf.set("cnaivebayes.vocabCount", vocabCountString);
double retvocabCount = stringifier.fromString(vocabCountString);
log.info("{}", retvocabCount);
conf.set("bayes.parameters", params.toString());
conf.set("output.table", output.toString());
client.setConf(conf);
JobClient.runJob(conf);
}
14
View Complete Implementation : CBayesThetaNormalizerDriver.java
Copyright Apache License 2.0
Author : sisirkoppaka
@Override
public void runJob(Path input, Path output, BayesParameters params) throws IOException {
Configurable client = new JobClient();
JobConf conf = new JobConf(CBayesThetaNormalizerDriver.clreplaced);
conf.setJobName("Complementary Bayes Theta Normalizer Driver running over input: " + input);
conf.setOutputKeyClreplaced(StringTuple.clreplaced);
conf.setOutputValueClreplaced(DoubleWritable.clreplaced);
FileInputFormat.addInputPath(conf, new Path(output, "trainer-weights/Sigma_j"));
FileInputFormat.addInputPath(conf, new Path(output, "trainer-tfIdf/trainer-tfIdf"));
Path outPath = new Path(output, "trainer-thetaNormalizer");
FileOutputFormat.setOutputPath(conf, outPath);
// conf.setNumMapTasks(100);
// conf.setNumReduceTasks(1);
conf.setMapperClreplaced(CBayesThetaNormalizerMapper.clreplaced);
conf.setInputFormat(SequenceFileInputFormat.clreplaced);
conf.setCombinerClreplaced(CBayesThetaNormalizerReducer.clreplaced);
conf.setReducerClreplaced(CBayesThetaNormalizerReducer.clreplaced);
conf.setOutputFormat(SequenceFileOutputFormat.clreplaced);
conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,org.apache.hadoop.io.serializer.WritableSerialization");
// Dont ever forget this. People should keep track of how hadoop conf
// parameters and make or break a piece of code
FileSystem dfs = FileSystem.get(outPath.toUri(), conf);
HadoopUtil.overwriteOutput(outPath);
Path sigmaKFiles = new Path(output, "trainer-weights/Sigma_k/*");
Map<String, Double> labelWeightSum = SequenceFileModelReader.readLabelSums(dfs, sigmaKFiles, conf);
DefaultStringifier<Map<String, Double>> mapStringifier = new DefaultStringifier<Map<String, Double>>(conf, GenericsUtil.getClreplaced(labelWeightSum));
String labelWeightSumString = mapStringifier.toString(labelWeightSum);
log.info("Sigma_k for Each Label");
Map<String, Double> c = mapStringifier.fromString(labelWeightSumString);
log.info("{}", c);
conf.set("cnaivebayes.sigma_k", labelWeightSumString);
Path sigmaKSigmaJFile = new Path(output, "trainer-weights/Sigma_kSigma_j/*");
double sigmaJSigmaK = SequenceFileModelReader.readSigmaJSigmaK(dfs, sigmaKSigmaJFile, conf);
DefaultStringifier<Double> stringifier = new DefaultStringifier<Double>(conf, Double.clreplaced);
String sigmaJSigmaKString = stringifier.toString(sigmaJSigmaK);
log.info("Sigma_kSigma_j for each Label and for each Features");
double retSigmaJSigmaK = stringifier.fromString(sigmaJSigmaKString);
log.info("{}", retSigmaJSigmaK);
conf.set("cnaivebayes.sigma_jSigma_k", sigmaJSigmaKString);
Path vocabCountFile = new Path(output, "trainer-tfIdf/trainer-vocabCount/*");
double vocabCount = SequenceFileModelReader.readVocabCount(dfs, vocabCountFile, conf);
String vocabCountString = stringifier.toString(vocabCount);
log.info("Vocabulary Count");
conf.set("cnaivebayes.vocabCount", vocabCountString);
double retvocabCount = stringifier.fromString(vocabCountString);
log.info("{}", retvocabCount);
conf.set("bayes.parameters", params.toString());
conf.set("output.table", output.toString());
client.setConf(conf);
JobClient.runJob(conf);
}
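On the task side, the values stringified above are typically read back in a mapper's configure() method. The sketch below shows that counterpart under the assumption of a hypothetical mapper class; it relies on the driver having already added JavaSerialization to "io.serializations", as done above.
// Hypothetical mapper-side counterpart (not part of the driver shown above):
// recover the DefaultStringifier-encoded values from the JobConf.
public class ThetaNormalizerMapperSketch extends MapReduceBase {
  private double sigmaJSigmaK;
  private double vocabCount;
  @Override
  public void configure(JobConf job) {
    try {
      DefaultStringifier<Double> stringifier = new DefaultStringifier<Double>(job, Double.class);
      sigmaJSigmaK = stringifier.fromString(job.get("cnaivebayes.sigma_jSigma_k"));
      vocabCount = stringifier.fromString(job.get("cnaivebayes.vocabCount"));
    } catch (IOException e) {
      throw new IllegalStateException(e);
    }
  }
}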
14
View Complete Implementation : PartialBuilder.java
Copyright Apache License 2.0
Author : ogrisel
Copyright Apache License 2.0
Author : ogrisel
@Override
protected void configureJob(JobConf job, int nbTrees, boolean oobEstimate) throws IOException {
FileInputFormat.setInputPaths(job, getDataPath());
FileOutputFormat.setOutputPath(job, getOutputPath(job));
job.setOutputKeyClass(TreeID.class);
job.setOutputValueClass(MapredOutput.class);
job.setMapperClass(Step1Mapper.class);
// no reducers
job.setNumReduceTasks(0);
job.setInputFormat(TextInputFormat.class);
job.setOutputFormat(SequenceFileOutputFormat.class);
// if we are in 'local' mode, correct the number of maps
// or the mappers won't be able to compute the right indexes
String tracker = job.get("mapred.job.tracker", "local");
if ("local".equals(tracker)) {
log.warn("Hadoop running in 'local' mode, only one map task will be launched");
job.setNumMapTasks(1);
}
}
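Because this job runs map-only with SequenceFileOutputFormat, its result is a set of part files of TreeID/MapredOutput pairs under the configured output path. A caller could read them back roughly as follows; this is a sketch, and the part-file glob and the no-argument constructors are assumptions based on the usual Writable conventions.
// Sketch: iterate over the SequenceFiles written by the map-only job.
Path outputPath = FileOutputFormat.getOutputPath(job);
FileSystem fs = outputPath.getFileSystem(job);
for (FileStatus status : fs.globStatus(new Path(outputPath, "part-*"))) {
  SequenceFile.Reader reader = new SequenceFile.Reader(fs, status.getPath(), job);
  try {
    TreeID key = new TreeID();
    MapredOutput value = new MapredOutput();
    while (reader.next(key, value)) {
      // each record is one decision tree built by a mapper
    }
  } finally {
    reader.close();
  }
}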
14
View Complete Implementation : TxnConsumer.java
Copyright Apache License 2.0
Author : facebookarchive
private JobConf createJobConf(Configuration conf2) throws IOException {
JobConf jobConf = new JobConf(conf);
String jobName = "transaction_consumer";
jobConf.setJobName(jobName);
String splitDir = workplace + "split/";
jobConf.set(TEST_DIR_LABEL, workplace);
jobConf.set(NOTIFIER_SERVER_ADDR_KEY, notifierServerAddrStr);
jobConf.set(NOTIFIER_SERVER_PORT_KEY, notifierServerPortStr);
jobConf.setMapSpeculativeExecution(false);
jobConf.setReduceSpeculativeExecution(false);
jobConf.setJarByClass(TxnConsumer.class);
jobConf.setMapperClass(ConsumerMapper.class);
jobConf.setReducerClass(ConsumerReducer.class);
jobConf.setMapOutputKeyClass(Text.class);
jobConf.setMapOutputValueClass(Text.class);
jobConf.setOutputKeyClass(Text.class);
jobConf.setOutputValueClass(Text.class);
jobConf.setInputFormat(TextInputFormat.class);
jobConf.setOutputFormat(TextOutputFormat.class);
FileInputFormat.addInputPath(jobConf, new Path(splitDir));
Random random = new Random();
FileOutputFormat.setOutputPath(jobConf, new Path(workplace, "output" + random.nextLong()));
jobConf.setNumMapTasks(numMappers);
createSplitFiles(conf, new Path(splitDir));
return jobConf;
}
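Because the output directory name ends in a random long, a caller that wants to inspect or remove the results would normally recover it with FileOutputFormat.getOutputPath(). A hypothetical caller in the same class might look like this; the cleanup step is an assumption and not part of the original code.
// Sketch: submit the configured job, then drop the randomly named output dir.
JobConf jobConf = createJobConf(conf);
RunningJob running = JobClient.runJob(jobConf);
if (running.isSuccessful()) {
  Path outDir = FileOutputFormat.getOutputPath(jobConf);
  outDir.getFileSystem(jobConf).delete(outDir, true);
}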
14
View Complete Implementation : CollocDriver.java
Copyright Apache License 2.0
Author : ogrisel
/**
* pass1: generate collocations, ngrams
*/
public static long generateCollocations(Path input, Path output, Configuration baseConf, boolean emitUnigrams, int maxNGramSize, int reduceTasks, int minSupport) throws IOException {
JobConf conf = new JobConf(baseConf, CollocDriver.class);
conf.setJobName(CollocDriver.class.getSimpleName() + ".generateCollocations:" + input);
conf.setMapOutputKeyClass(GramKey.class);
conf.setMapOutputValueClass(Gram.class);
conf.setPartitionerClass(GramKeyPartitioner.class);
conf.setOutputValueGroupingComparator(GramKeyGroupComparator.class);
conf.setOutputKeyClass(Gram.class);
conf.setOutputValueClass(Gram.class);
conf.setCombinerClass(CollocCombiner.class);
conf.setBoolean(EMIT_UNIGRAMS, emitUnigrams);
FileInputFormat.setInputPaths(conf, input);
Path outputPath = new Path(output, SUBGRAM_OUTPUT_DIRECTORY);
FileOutputFormat.setOutputPath(conf, outputPath);
conf.setInputFormat(SequenceFileInputFormat.class);
conf.setMapperClass(CollocMapper.class);
conf.setOutputFormat(SequenceFileOutputFormat.class);
conf.setReducerClass(CollocReducer.class);
conf.setInt(CollocMapper.MAX_SHINGLE_SIZE, maxNGramSize);
conf.setInt(CollocReducer.MIN_SUPPORT, minSupport);
conf.setNumReduceTasks(reduceTasks);
RunningJob job = JobClient.runJob(conf);
return job.getCounters().findCounter(CollocMapper.Count.NGRAM_TOTAL).getValue();
}
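The returned NGRAM_TOTAL counter is what a caller would use to decide whether the later passes are worth running. A minimal, hypothetical invocation follows; all parameter values are illustrative, and the log field is assumed to be the driver's own logger.
long ngramCount = generateCollocations(input, output, new Configuration(),
    true /* emitUnigrams */, 2 /* maxNGramSize */, 1 /* reduceTasks */, 2 /* minSupport */);
if (ngramCount == 0) {
  log.warn("No ngrams found in {}", input);
}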
14
View Complete Implementation : CollocDriver.java
Copyright Apache License 2.0
Author : sisirkoppaka
/**
* pass1: generate collocations, ngrams
*/
public static long generateCollocations(Path input, Path output, boolean emitUnigrams, int maxNGramSize, int reduceTasks, int minSupport) throws IOException {
JobConf conf = new JobConf(CollocDriver.class);
conf.setJobName(CollocDriver.class.getSimpleName() + ".generateCollocations:" + input);
conf.setMapOutputKeyClass(GramKey.class);
conf.setMapOutputValueClass(Gram.class);
conf.setPartitionerClass(GramKeyPartitioner.class);
conf.setOutputValueGroupingComparator(GramKeyGroupComparator.class);
conf.setOutputKeyClass(Gram.class);
conf.setOutputValueClass(Gram.class);
conf.setCombinerClass(CollocCombiner.class);
conf.setBoolean(EMIT_UNIGRAMS, emitUnigrams);
FileInputFormat.setInputPaths(conf, input);
Path outPath = new Path(output, SUBGRAM_OUTPUT_DIRECTORY);
FileOutputFormat.setOutputPath(conf, outPath);
conf.setInputFormat(SequenceFileInputFormat.class);
conf.setMapperClass(CollocMapper.class);
conf.setOutputFormat(SequenceFileOutputFormat.class);
conf.setReducerClass(CollocReducer.class);
conf.setInt(CollocMapper.MAX_SHINGLE_SIZE, maxNGramSize);
conf.setInt(CollocReducer.MIN_SUPPORT, minSupport);
conf.setNumReduceTasks(reduceTasks);
RunningJob job = JobClient.runJob(conf);
return job.getCounters().findCounter(CollocMapper.Count.NGRAM_TOTAL).getValue();
}
14
View Complete Implementation : WikipediaDatasetCreatorDriver.java
Copyright Apache License 2.0
Author : sisirkoppaka
/**
* Run the job
*
* @param input
* the input pathname String
* @param output
* the output pathname String
* @param catFile
* the file containing the Wikipedia categories
* @param exactMatchOnly
* if true, then the Wikipedia category must match exactly instead of simply containing the
* category string
*/
public static void runJob(String input, String output, String catFile, boolean exactMatchOnly, Class<? extends Analyzer> analyzerClass) throws IOException {
JobClient client = new JobClient();
JobConf conf = new JobConf(WikipediaDatasetCreatorDriver.class);
if (WikipediaDatasetCreatorDriver.log.isInfoEnabled()) {
log.info("Input: {} Out: {} Categories: {}", new Object[] { input, output, catFile });
}
conf.set("key.value.separator.in.input.line", " ");
conf.set("xmlinput.start", "<text xml:space=\"preserve\">");
conf.set("xmlinput.end", "</text>");
conf.setOutputKeyClass(Text.class);
conf.setOutputValueClass(Text.class);
conf.setBoolean("exact.match.only", exactMatchOnly);
conf.set("replacedyzer.clreplaced", replacedyzerClreplaced.getName());
FileInputFormat.setInputPaths(conf, new Path(input));
Path outPath = new Path(output);
FileOutputFormat.setOutputPath(conf, outPath);
conf.setMapperClass(WikipediaDatasetCreatorMapper.class);
conf.setNumMapTasks(100);
conf.setInputFormat(XmlInputFormat.class);
// conf.setCombinerClass(WikipediaDatasetCreatorReducer.class);
conf.setReducerClass(WikipediaDatasetCreatorReducer.class);
conf.setOutputFormat(WikipediaDatasetCreatorOutputFormat.class);
conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,org.apache.hadoop.io.serializer.WritableSerialization");
// Don't ever forget this. People should keep track of how Hadoop conf
// parameters can make or break a piece of code
HadoopUtil.overwriteOutput(outPath);
Set<String> categories = new HashSet<String>();
for (String line : new FileLineIterable(new File(catFile))) {
categories.add(line.trim().toLowerCase());
}
DefaultStringifier<Set<String>> setStringifier = new DefaultStringifier<Set<String>>(conf, GenericsUtil.getClass(categories));
String categoriesStr = setStringifier.toString(categories);
conf.set("wikipedia.categories", categoriesStr);
client.setConf(conf);
JobClient.runJob(conf);
}
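A hypothetical invocation of this driver could look like the following; the paths are placeholders, and the Lucene StandardAnalyzer is just one possible Analyzer implementation.
// Sketch: run the dataset creator over a directory of Wikipedia XML chunks.
WikipediaDatasetCreatorDriver.runJob(
    "wikipedia/chunks",      // input directory (placeholder)
    "wikipedia/dataset",     // output directory (placeholder)
    "categories.txt",        // one category per line (placeholder)
    false,                   // substring category match is enough
    StandardAnalyzer.class); // org.apache.lucene.analysis.standard.StandardAnalyzer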
14
View Complete Implementation : WikipediaDatasetCreatorDriver.java
Copyright Apache License 2.0
Author : ogrisel
/**
* Run the job
*
* @param input
* the input pathname String
* @param output
* the output pathname String
* @param catFile
* the file containing the Wikipedia categories
* @param exactMatchOnly
* if true, then the Wikipedia category must match exactly instead of simply containing the
* category string
*/
public static void runJob(String input, String output, String catFile, boolean exactMatchOnly, Class<? extends Analyzer> analyzerClass) throws IOException {
JobClient client = new JobClient();
JobConf conf = new JobConf(WikipediaDatasetCreatorDriver.class);
if (log.isInfoEnabled()) {
log.info("Input: {} Out: {} Categories: {}", new Object[] { input, output, catFile });
}
conf.set("key.value.separator.in.input.line", " ");
conf.set("xmlinput.start", "<text xml:space=\"preserve\">");
conf.set("xmlinput.end", "</text>");
conf.setOutputKeyClass(Text.class);
conf.setOutputValueClass(Text.class);
conf.setBoolean("exact.match.only", exactMatchOnly);
conf.set("analyzer.class", analyzerClass.getName());
FileInputFormat.setInputPaths(conf, new Path(input));
Path outPath = new Path(output);
FileOutputFormat.setOutputPath(conf, outPath);
conf.setMapperClass(WikipediaDatasetCreatorMapper.class);
conf.setNumMapTasks(100);
conf.setInputFormat(XmlInputFormat.class);
// conf.setCombinerClass(WikipediaDatasetCreatorReducer.class);
conf.setReducerClass(WikipediaDatasetCreatorReducer.class);
conf.setOutputFormat(WikipediaDatasetCreatorOutputFormat.class);
conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization," + "org.apache.hadoop.io.serializer.WritableSerialization");
// Don't ever forget this. People should keep track of how Hadoop conf
// parameters can make or break a piece of code
HadoopUtil.overwriteOutput(outPath);
Set<String> categories = new HashSet<String>();
for (String line : new FileLineIterable(new File(catFile))) {
categories.add(line.trim().toLowerCase(Locale.ENGLISH));
}
DefaultStringifier<Set<String>> setStringifier = new DefaultStringifier<Set<String>>(conf, GenericsUtil.getClass(categories));
String categoriesStr = setStringifier.toString(categories);
conf.set("wikipedia.categories", categoriesStr);
client.setConf(conf);
JobClient.runJob(conf);
}
14
View Complete Implementation : PartialBuilder.java
Copyright Apache License 2.0
Author : sisirkoppaka
@Override
protected void configureJob(JobConf job, int nbTrees, boolean oobEstimate) throws IOException {
FileInputFormat.setInputPaths(job, getDataPath());
FileOutputFormat.setOutputPath(job, getOutputPath(job));
job.setOutputKeyClass(TreeID.class);
job.setOutputValueClass(MapredOutput.class);
job.setMapperClass(Step1Mapper.class);
// no reducers
job.setNumReduceTasks(0);
job.setInputFormat(TextInputFormat.class);
job.setOutputFormat(SequenceFileOutputFormat.class);
// if we are in 'local' mode, correct the number of maps
// or the mappers won't be able to compute the right indexes
String tracker = job.get("mapred.job.tracker", "local");
if ("local".equals(tracker)) {
log.warn("Hadoop running in 'local' mode, only one map task will be launched");
job.setNumMapTasks(1);
}
}
13
View Complete Implementation : CDMahoutEvaluator.java
Copyright Apache License 2.0
Author : ogrisel
/**
* Configure the job
*
* @param conf Job to configure
* @param rules classification rules to evaluate
* @param target label value to evaluate the rules for
* @param inpath input path (the dataset)
* @param outpath output <code>Path</code>
* @param split DatasetSplit used to separate training and testing input
*/
private static void configureJob(JobConf conf, List<? extends Rule> rules, int target, Path inpath, Path outpath, DatasetSplit split) {
split.storeJobParameters(conf);
FileInputFormat.setInputPaths(conf, inpath);
FileOutputFormat.setOutputPath(conf, outpath);
conf.setOutputKeyClass(LongWritable.class);
conf.setOutputValueClass(CDFitness.class);
conf.setMapperClass(CDMapper.class);
conf.setCombinerClass(CDReducer.class);
conf.setReducerClass(CDReducer.class);
conf.setInputFormat(DatasetTextInputFormat.class);
conf.setOutputFormat(SequenceFileOutputFormat.class);
// store the parameters
conf.set(CDMapper.CLASSDISCOVERY_RULES, StringUtils.toString(rules));
conf.set(CDMapper.CLASSDISCOVERY_DATASET, StringUtils.toString(DataSet.getDataSet()));
conf.setInt(CDMapper.CLASSDISCOVERY_TARGET_LABEL, target);
}
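configureJob() only fills in the JobConf; a caller in the same class still has to create and submit it. A rough sketch follows, where the variables are placeholders for values the evaluator already holds.
// Sketch: build, configure and run the evaluation job.
JobConf conf = new JobConf(CDMahoutEvaluator.class);
configureJob(conf, rules, target, inpath, outpath, split);
JobClient.runJob(conf);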
13
View Complete Implementation : CDMahoutEvaluator.java
Copyright Apache License 2.0
Author : sisirkoppaka
/**
* Configure the job
*
* @param conf Job to configure
* @param rules classification rules to evaluate
* @param target label value to evaluate the rules for
* @param inpath input path (the dataset)
* @param outpath output <code>Path</code>
* @param split DatasetSplit used to separate training and testing input
*/
private static void configureJob(JobConf conf, List<? extends Rule> rules, int target, Path inpath, Path outpath, DatasetSplit split) {
split.storeJobParameters(conf);
FileInputFormat.setInputPaths(conf, inpath);
FileOutputFormat.setOutputPath(conf, outpath);
conf.setOutputKeyClass(LongWritable.class);
conf.setOutputValueClass(CDFitness.class);
conf.setMapperClass(CDMapper.class);
conf.setCombinerClass(CDReducer.class);
conf.setReducerClass(CDReducer.class);
conf.setInputFormat(DatasetTextInputFormat.class);
conf.setOutputFormat(SequenceFileOutputFormat.class);
// store the parameters
conf.set(CDMapper.CLASSDISCOVERY_RULES, StringUtils.toString(rules));
conf.set(CDMapper.CLASSDISCOVERY_DATASET, StringUtils.toString(DataSet.getDataSet()));
conf.setInt(CDMapper.CLASSDISCOVERY_TARGET_LABEL, target);
}
13
View Complete Implementation : DataGenerator.java
Copyright Apache License 2.0
Author : facebookarchive
/**
* Read file structure file under the input directory. Create each file
* under the specified root. The file names are relative to the root.
*/
private void genFiles() throws IOException {
//
// BufferedReader in = new BufferedReader(new FileReader(new File(inDir,
// StructureGenerator.FILE_STRUCTURE_FILE_NAME)));
// String line;
// while ((line = in.readLine()) != null) {
// String[] tokens = line.split(" ");
// if (tokens.length != 2) {
// throw new IOException("Expect at most 2 tokens per line: "
// + line);
// }
// String fileName = root + tokens[0];
// long fileSize = (long) (BLOCK_SIZE * Double.parseDouble(tokens[1]));
// genFile(new Path(fileName), fileSize);
// }
config = new Configuration(getConf());
config.setInt("dfs.replication", 3);
config.set("dfs.rootdir", root.toString());
JobConf job = new JobConf(config, DataGenerator.class);
job.setJobName("data-genarator");
FileOutputFormat.setOutputPath(job, new Path("data-generator-result"));
// create the input for the map-reduce job
Path inputPath = new Path(ROOT + "load_input");
fs.mkdirs(inputPath);
fs.copyFromLocalFile(new Path(inDir + "/" + StructureGenerator.FILE_STRUCTURE_FILE_NAME), inputPath);
FileInputFormat.setInputPaths(job, new Path(ROOT + "load_input"));
job.setInputFormat(TextInputFormat.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
job.setMapperClass(CreateFiles.class);
job.setNumMapTasks(nFiles / 10);
job.setNumReduceTasks(0);
JobClient.runJob(job);
}
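The commented-out reader documents the expected structure-file format: each line holds a file name relative to the root and a size expressed in blocks. A minimal sketch of that parsing step, assuming the BLOCK_SIZE constant and genFile() helper referenced by the comment exist elsewhere in the class:
// Sketch of the per-line parsing the commented-out block describes.
BufferedReader in = new BufferedReader(new FileReader(
    new File(inDir, StructureGenerator.FILE_STRUCTURE_FILE_NAME)));
String line;
while ((line = in.readLine()) != null) {
  String[] tokens = line.split(" ");
  if (tokens.length != 2) {
    throw new IOException("Expect at most 2 tokens per line: " + line);
  }
  // tokens[0]: path relative to the root, tokens[1]: size in blocks
  genFile(new Path(root + tokens[0]), (long) (BLOCK_SIZE * Double.parseDouble(tokens[1])));
}
in.close();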
13
View Complete Implementation : DFSGeneralTest.java
Copyright Apache License 2.0
Author : facebookarchive
private void updateJobConf(JobConf conf, Path inputPath, Path outputPath) {
// set specific job config
conf.setLong(NUMBER_OF_MAPS_KEY, nmaps);
conf.setLong(NUMBER_OF_THREADS_KEY, nthreads);
conf.setInt(BUFFER_SIZE_KEY, buffersize);
conf.setLong(WRITER_DATARATE_KEY, datarate);
conf.setLong("mapred.task.timeout", Long.MAX_VALUE);
conf.set(OUTPUT_DIR_KEY, output);
// set the output and input for the map reduce
FileInputFormat.setInputPaths(conf, inputPath);
FileOutputFormat.setOutputPath(conf, outputPath);
conf.setInputFormat(SequenceFileInputFormat.class);
conf.setOutputKeyClass(Text.class);
conf.setOutputValueClass(Text.class);
conf.setNumReduceTasks(1);
conf.setSpeculativeExecution(false);
}
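The custom keys written here are presumably read back on the task side with the standard Configuration getters; the following is a minimal sketch of that counterpart, with illustrative default values.
// Sketch: mapper-side readback of the job-specific configuration keys.
public void configure(JobConf job) {
  long nmaps = job.getLong(NUMBER_OF_MAPS_KEY, 1);
  long nthreads = job.getLong(NUMBER_OF_THREADS_KEY, 1);
  int buffersize = job.getInt(BUFFER_SIZE_KEY, 64 * 1024);
  long datarate = job.getLong(WRITER_DATARATE_KEY, 0);
  String outputDir = job.get(OUTPUT_DIR_KEY);
  // ... use these values to drive the write workload ...
}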
13
View Complete Implementation : NNBench.java
Copyright Apache License 2.0
Author : facebookarchive
/**
* Run the test
*
* @throws IOException on error
*/
public static void runTests(Configuration config) throws IOException {
config.setLong("io.bytes.per.checksum", bytesPerChecksum);
JobConf job = new JobConf(config, NNBench.class);
job.setJobName("NNBench-" + operation);
FileInputFormat.setInputPaths(job, new Path(baseDir, CONTROL_DIR_NAME));
job.setInputFormat(SequenceFileInputFormat.class);
// Explicitly set number of max map attempts to 1.
job.setMaxMapAttempts(1);
// Explicitly turn off speculative execution
job.setSpeculativeExecution(false);
job.setMapperClass(NNBenchMapper.class);
job.setReducerClass(NNBenchReducer.class);
FileOutputFormat.setOutputPath(job, new Path(baseDir, OUTPUT_DIR_NAME));
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
job.setNumReduceTasks((int) numberOfReduces);
JobClient.runJob(job);
}