org.apache.crunch.PTable - java examples

Here are examples of the Java API org.apache.crunch.PTable, taken from open source projects. By voting up, you can indicate which examples are most useful and appropriate.

109 Examples 7

19 View Complete Implementation : TopListTest.java
Copyright Apache License 2.0
Author : apache
/**
 * Checks that {@code TopList.topNYbyX(data, 2)} keeps, for each key, the two
 * most frequent values paired with their occurrence counts.
 */
@Test
public void testTopNYbyX() {
    // "a" -> x (3 times), y (2 times), z (once); "b" -> x (2 times), z (once).
    PTable<String, String> data = MemPipeline.typedTableOf(tableOf(strings(), strings()), "a", "x", "a", "x", "a", "x", "a", "y", "a", "y", "a", "z", "b", "x", "b", "x", "b", "z");
    Map<String, Collection<Pair<Long, String>>> actual = TopList.topNYbyX(data, 2).materializeToMap();
    Map<String, Collection<Pair<Long, String>>> expected = ImmutableMap.of("a", collectionOf(Pair.of(3L, "x"), Pair.of(2L, "y")), "b", collectionOf(Pair.of(2L, "x"), Pair.of(1L, "z")));
    assertEquals(expected, actual);
}

19 View Complete Implementation : SPTablesTest.java
Copyright Apache License 2.0
Author : spotify
/**
 * Checks that {@code SPTables.swapKeyValue} turns every (key, value) row into
 * a (value, key) row.
 */
@Test
public void testSwapKeyValue() {
    PTable<String, Long> table = MemPipeline.typedTableOf(tableOf(strings(), longs()), "hello", 14L, "goodbye", 21L);
    PTable<Long, String> actual = SPTables.swapKeyValue(table);
    Map<Long, String> expected = ImmutableMap.of(14L, "hello", 21L, "goodbye");
    assertEquals(expected, actual.materializeToMap());
}

19 View Complete Implementation : Sort.java
Copyright Apache License 2.0
Author : apache
/**
 * Sorts a {@code PCollection} by the natural ordering of its elements.
 *
 * <p>Every element becomes the key of a table whose values are all
 * {@code null}; that table is grouped by key with options built from
 * {@code numReducers} and {@code order}, and the keys of the ungrouped result
 * are returned.
 *
 * @param collection the elements to sort
 * @param numReducers the reducer count passed on to the grouping options
 * @param order the order passed on to the grouping options
 * @return a {@code PCollection} representing the sorted collection
 */
public static <T> PCollection<T> sort(PCollection<T> collection, int numReducers, Order order) {
    Configuration configuration = collection.getPipeline().getConfiguration();
    PTypeFamily family = collection.getTypeFamily();
    PTableType<T, Void> keyOnlyType = family.tableOf(collection.getPType(), family.nulls());

    // Wraps each element as (element, null) so it can be grouped by key.
    DoFn<T, Pair<T, Void>> toKeyOnlyPair = new DoFn<T, Pair<T, Void>>() {

        @Override
        public void process(T element, Emitter<Pair<T, Void>> out) {
            out.emit(Pair.of(element, (Void) null));
        }
    };

    PTable<T, Void> keyed = collection.parallelDo("sort-pre", toKeyOnlyPair, keyOnlyType);
    GroupingOptions groupingOptions = buildGroupingOptions(keyed, configuration, numReducers, order);
    return keyed.groupByKey(groupingOptions).ungroup().keys();
}

19 View Complete Implementation : QuantilesTest.java
Copyright Apache License 2.0
Author : apache
/**
 * Checks that the 0.5 quantile of four values is identical for the
 * distributed and the in-memory implementation when it falls between ranks.
 */
@Test
public void testQuantilesBetween() {
    PTable<String, Integer> testTable = MemPipeline.typedTableOf(tableOf(strings(), ints()),
        "a", 5,
        // We expect the 0.5 to correspond to this element, according to the "nearest rank" %ile definition.
        "a", 2,
        "a", 4,
        "a", 1);
    Map<String, Result<Integer>> actualS = Quantiles.distributed(testTable, 0.5).materializeToMap();
    Map<String, Result<Integer>> actualM = Quantiles.inMemory(testTable, 0.5).materializeToMap();
    Map<String, Result<Integer>> expected = ImmutableMap.of("a", result(4, Pair.of(0.5, 2)));
    assertEquals(expected, actualS);
    assertEquals(expected, actualM);
}

19 View Complete Implementation : OneToManyJoinTest.java
Copyright Apache License 2.0
Author : apache
/**
 * Joins a left table with keys 1 and 2 against a right table that only has
 * key 2 and expects a single joined row for key 2.
 */
@Test
public void testOneToMany_UnmatchedOnRightSide() {
    PTable<Integer, String> left = MemPipeline.typedTableOf(Avros.tableOf(Avros.ints(), Avros.strings()), 1, "one", 2, "two");
    PTable<Integer, String> right = MemPipeline.typedTableOf(Avros.tableOf(Avros.ints(), Avros.strings()), 2, "2A", 2, "2B");
    PCollection<Pair<String, String>> joined = OneToManyJoin.oneToManyJoin(left, right, new StringJoinFn(), Avros.pairs(Avros.strings(), Avros.strings()));
    List<Pair<String, String>> expected = ImmutableList.of(Pair.of("two", "2A,2B"));
    assertEquals(expected, Lists.newArrayList(joined.materialize()));
}

19 View Complete Implementation : Cartesian.java
Copyright Apache License 2.0
Author : apache
/**
 * Computes the full cross join of two {@link PTable}s, following the strategy
 * of Pig's CROSS operator.
 *
 * @see <a href="http://en.wikipedia.org/wiki/Join_(SQL)#Cross_join">Cross
 *      Join</a>
 * @param left
 *          The first PTable taking part in the cross join.
 * @param right
 *          The second PTable taking part in the cross join.
 * @param parallelism
 *          The square root of the number of reducers to use. Increasing
 *          parallelism also increases copied data.
 * @param <K1>
 *          Key type of the left PTable.
 * @param <K2>
 *          Key type of the right PTable.
 * @param <U>
 *          Value type of the left {@link PTable}.
 * @param <V>
 *          Value type of the right {@link PTable}.
 * @return The joined result as tuples of ((K1,K2), (U,V)).
 */
public static <K1, K2, U, V> PTable<Pair<K1, K2>, Pair<U, V>> cross(PTable<K1, U> left, PTable<K2, V> right, int parallelism) {
    /*
     * Emulates this PigLatin:
     *   A = foreach table1 generate flatten(GFCross(0, 2)), flatten(*);
     *   B = foreach table2 generate flatten(GFCross(1, 2)), flatten(*);
     *   C = cogroup A by ($0, $1), B by ($0, $1);
     *   result = foreach C generate flatten(A), flatten(B);
     */
    PTypeFamily leftFamily = left.getTypeFamily();
    PTypeFamily rightFamily = right.getTypeFamily();

    // Both sides are re-keyed by a pair of ints produced by GFCross.
    PTable<Pair<Integer, Integer>, Pair<K1, U>> taggedLeft = left.parallelDo(
        new GFCross<Pair<K1, U>>(0, parallelism),
        leftFamily.tableOf(
            leftFamily.pairs(leftFamily.ints(), leftFamily.ints()),
            leftFamily.pairs(left.getKeyType(), left.getValueType())));
    PTable<Pair<Integer, Integer>, Pair<K2, V>> taggedRight = right.parallelDo(
        new GFCross<Pair<K2, V>>(1, parallelism),
        rightFamily.tableOf(
            rightFamily.pairs(rightFamily.ints(), rightFamily.ints()),
            rightFamily.pairs(right.getKeyType(), right.getValueType())));

    PTable<Pair<Integer, Integer>, Pair<Pair<K1, U>, Pair<K2, V>>> joined = taggedLeft.join(taggedRight);
    PTypeFamily joinedFamily = joined.getTypeFamily();

    // Drops the int-pair key and regroups ((k1, u), (k2, v)) into ((k1, k2), (u, v)).
    MapFn<Pair<Pair<Integer, Integer>, Pair<Pair<K1, U>, Pair<K2, V>>>, Pair<Pair<K1, K2>, Pair<U, V>>> regroup =
        new MapFn<Pair<Pair<Integer, Integer>, Pair<Pair<K1, U>, Pair<K2, V>>>, Pair<Pair<K1, K2>, Pair<U, V>>>() {

            @Override
            public Pair<Pair<K1, K2>, Pair<U, V>> map(Pair<Pair<Integer, Integer>, Pair<Pair<K1, U>, Pair<K2, V>>> cell) {
                Pair<K1, U> leftEntry = cell.second().first();
                Pair<K2, V> rightEntry = cell.second().second();
                Pair<K1, K2> keys = Pair.of(leftEntry.first(), rightEntry.first());
                Pair<U, V> values = Pair.of(leftEntry.second(), rightEntry.second());
                return Pair.of(keys, values);
            }
        };

    return joined.parallelDo(regroup, joinedFamily.tableOf(
        joinedFamily.pairs(left.getKeyType(), right.getKeyType()),
        joinedFamily.pairs(left.getValueType(), right.getValueType())));
}

19 View Complete Implementation : PercentilesTest.java
Copyright Apache License 2.0
Author : spotify
/**
 * Checks that the 0.5 percentile of four values is identical for the
 * distributed and the in-memory implementation when it falls between ranks.
 */
@Test
public void testPercentilesBetween() {
    PTable<String, Integer> testTable = MemPipeline.typedTableOf(tableOf(strings(), ints()),
        "a", 5,
        // We expect the 0.5 to correspond to this element, according to the "nearest rank" %ile definition.
        "a", 2,
        "a", 4,
        "a", 1);
    Map<String, Result<Integer>> actualS = Percentiles.distributed(testTable, 0.5).materializeToMap();
    Map<String, Result<Integer>> actualM = Percentiles.inMemory(testTable, 0.5).materializeToMap();
    Map<String, Result<Integer>> expected = ImmutableMap.of("a", result(4, Pair.of(0.5, 2)));
    assertEquals(expected, actualS);
    assertEquals(expected, actualM);
}

19 View Complete Implementation : MapredIT.java
Copyright Apache License 2.0
Author : apache
/**
 * Keys the lines of {@code shakes.txt} by their length, runs
 * {@code TestReducer} over the grouped table via {@code Mapred.reduce}, and
 * checks the number of stages and the value of the ("thou", "count") counter.
 */
@Test
public void testReducer() throws Exception {
    Pipeline p = new MRPipeline(MapredIT.class, tempDir.getDefaultConfiguration());
    Path shakesPath = tempDir.copyResourcePath("shakes.txt");
    PCollection<String> in = p.read(From.textFile(shakesPath));
    PTable<IntWritable, Text> two = in.parallelDo(new MapFn<String, Pair<IntWritable, Text>>() {

        @Override
        public Pair<IntWritable, Text> map(String input) {
            return Pair.of(new IntWritable(input.length()), new Text(input));
        }
    }, Writables.tableOf(Writables.writables(IntWritable.class), Writables.writables(Text.class)));
    PTable<Text, LongWritable> out = Mapred.reduce(two.groupByKey(), TestReducer.class, Text.class, LongWritable.class);
    out.write(To.sequenceFile(tempDir.getPath("temp")));
    PipelineResult res = p.done();
    assertEquals(1, res.getStageResults().size());
    StageResult sr = res.getStageResults().get(0);
    assertEquals(103, sr.getCounters().findCounter("thou", "count").getValue());
}

19 View Complete Implementation : AverageTest.java
Copyright Apache License 2.0
Author : apache
/**
 * Checks that {@code Average.meanValue} returns the per-key arithmetic mean
 * for keys with two, one and three values.
 */
@Test
public void testMeanValue() {
    PTable<String, Integer> testTable = MemPipeline.typedTableOf(tableOf(strings(), ints()), "a", 2, "a", 10, "b", 3, "c", 3, "c", 4, "c", 5);
    Map<String, Double> actual = Average.meanValue(testTable).materializeToMap();
    Map<String, Double> expected = ImmutableMap.of("a", 6.0, "b", 3.0, "c", 4.0);
    assertEquals(expected, actual);
}

19 View Complete Implementation : MapreduceIT.java
Copyright Apache License 2.0
Author : apache
/**
 * Keys the lines of {@code shakes.txt} by their length, runs
 * {@code TestMapper} over the table via {@code Mapreduce.map}, and checks the
 * number of stages and the value of the ("written", "out") counter.
 */
@Test
public void testMapper() throws Exception {
    Pipeline p = new MRPipeline(MapreduceIT.class, tempDir.getDefaultConfiguration());
    Path shakesPath = tempDir.copyResourcePath("shakes.txt");
    PCollection<String> in = p.read(From.textFile(shakesPath));
    PTable<IntWritable, Text> two = in.parallelDo(new MapFn<String, Pair<IntWritable, Text>>() {

        @Override
        public Pair<IntWritable, Text> map(String input) {
            return Pair.of(new IntWritable(input.length()), new Text(input));
        }
    }, Writables.tableOf(Writables.writables(IntWritable.class), Writables.writables(Text.class)));
    PTable<IntWritable, Text> out = Mapreduce.map(two, TestMapper.class, IntWritable.class, Text.class);
    out.write(To.sequenceFile(tempDir.getPath("temp")));
    PipelineResult res = p.done();
    assertEquals(1, res.getStageResults().size());
    StageResult sr = res.getStageResults().get(0);
    assertEquals(3285, sr.getCounters().findCounter("written", "out").getValue());
}

19 View Complete Implementation : PTablesTest.java
Copyright Apache License 2.0
Author : apache
/**
 * Checks that {@code PTables.swapKeyValue} turns every (key, value) row into
 * a (value, key) row.
 */
@Test
public void testSwapKeyValue() {
    PTable<String, Long> table = MemPipeline.typedTableOf(tableOf(strings(), longs()), "hello", 14L, "goodbye", 21L);
    PTable<Long, String> actual = PTables.swapKeyValue(table);
    Map<Long, String> expected = ImmutableMap.of(14L, "hello", 21L, "goodbye");
    assertEquals(expected, actual.materializeToMap());
}

19 View Complete Implementation : OneToManyJoinTest.java
Copyright Apache License 2.0
Author : apache
/**
 * Joins a left table that carries two values for key 1 ("one" and
 * "oneExtra") and expects exactly one joined row per key, with the key-1 row
 * built from "one".
 */
@Test
public void testOneToMany_MultipleValuesForSameKeyOnLeft() {
    PTable<Integer, String> left = MemPipeline.typedTableOf(Avros.tableOf(Avros.ints(), Avros.strings()), 1, "one", 2, "two", 1, "oneExtra");
    PTable<Integer, String> right = MemPipeline.typedTableOf(Avros.tableOf(Avros.ints(), Avros.strings()), 1, "1A", 1, "1B", 2, "2A", 2, "2B");
    PCollection<Pair<String, String>> joined = OneToManyJoin.oneToManyJoin(left, right, new StringJoinFn(), Avros.pairs(Avros.strings(), Avros.strings()));
    List<Pair<String, String>> expected = ImmutableList.of(Pair.of("one", "1A,1B"), Pair.of("two", "2A,2B"));
    assertEquals(expected, Lists.newArrayList(joined.materialize()));
}

19 View Complete Implementation : Aggregate.java
Copyright Apache License 2.0
Author : apache
/**
 * Returns the number of elements in the provided PCollection.
 *
 * <p>Each element is mapped to the pair (1, 1L); the pairs are grouped on a
 * single reducer and their values summed with {@code Aggregators.SUM_LONGS()}.
 *
 * @param collect The PCollection whose elements should be counted.
 * @param <S> The type of the PCollection.
 * @return A {@code PObject} containing the number of elements in the {@code PCollection}.
 */
public static <S> PObject<Long> length(PCollection<S> collect) {
    PTypeFamily tf = collect.getTypeFamily();
    PTable<Integer, Long> countTable = collect.parallelDo("Aggregate.count", new MapFn<S, Pair<Integer, Long>>() {

        @Override
        public Pair<Integer, Long> map(S input) {
            return Pair.of(1, 1L);
        }

        // Emits an extra (1, 0L) that does not change the sum.
        // NOTE(review): presumably guarantees a row exists even when the task saw no input - confirm.
        @Override
        public void cleanup(Emitter<Pair<Integer, Long>> e) {
            e.emit(Pair.of(1, 0L));
        }
    }, tf.tableOf(tf.ints(), tf.longs())).groupByKey(GroupingOptions.builder().numReducers(1).build()).combineValues(Aggregators.SUM_LONGS());
    PCollection<Long> count = countTable.values();
    // NOTE(review): the 0L argument looks like the fallback used when `count` has no elements - confirm.
    return new FirstElementPObject<Long>(count, 0L);
}

19 View Complete Implementation : QuantilesTest.java
Copyright Apache License 2.0
Author : apache
/**
 * Checks that the 0.5 quantile of the ten values 10..100 is 50 for both the
 * distributed and the in-memory implementation.
 */
@Test
public void testQuantilesLessThanOrEqual() {
    PTable<String, Integer> testTable = MemPipeline.typedTableOf(tableOf(strings(), ints()), "a", 10, "a", 20, "a", 30, "a", 40, "a", 50, "a", 60, "a", 70, "a", 80, "a", 90, "a", 100);
    Map<String, Result<Integer>> actualS = Quantiles.distributed(testTable, 0.5).materializeToMap();
    Map<String, Result<Integer>> actualM = Quantiles.inMemory(testTable, 0.5).materializeToMap();
    Map<String, Result<Integer>> expected = ImmutableMap.of("a", result(10, Pair.of(0.5, 50)));
    assertEquals(expected, actualS);
    assertEquals(expected, actualM);
}

19 View Complete Implementation : JoinTester.java
Copyright Apache License 2.0
Author : apache
/**
 * Splits both inputs into words, counts the words of each input, joins the two
 * count tables with the strategy and join type supplied by
 * {@code getJoinStrategy()} / {@code getJoinType()}, and sums the two counts
 * per word. A count missing on either side is treated as zero.
 *
 * @param w1 the first collection of text
 * @param w2 the second collection of text
 * @param ptf the type family used for all intermediate and result types
 * @return a table of (word, summed count)
 */
protected PTable<String, Long> join(PCollection<String> w1, PCollection<String> w2, PTypeFamily ptf) {
    PTableType<String, Long> resultType = ptf.tableOf(ptf.strings(), ptf.longs());
    PTable<String, Long> firstCounts = Aggregate.count(w1.parallelDo("ws1", new WordSplit(), ptf.strings()));
    PTable<String, Long> secondCounts = Aggregate.count(w2.parallelDo("ws2", new WordSplit(), ptf.strings()));
    JoinStrategy<String, Long, Long> strategy = getJoinStrategy();
    PTable<String, Pair<Long, Long>> joinedCounts = strategy.join(firstCounts, secondCounts, getJoinType());

    DoFn<Pair<String, Pair<Long, Long>>, Pair<String, Long>> addCounts = new DoFn<Pair<String, Pair<Long, Long>>, Pair<String, Long>>() {

        @Override
        public void process(Pair<String, Pair<Long, Long>> row, Emitter<Pair<String, Long>> out) {
            Long first = row.second().first();
            Long second = row.second().second();
            long total = (first == null ? 0 : first) + (second == null ? 0 : second);
            out.emit(Pair.of(row.first(), total));
        }
    };
    return joinedCounts.parallelDo("cnt", addCounts, resultType);
}

19 View Complete Implementation : CrunchDatasets.java
Copyright Apache License 2.0
Author : kite-sdk
/**
 * Partitions {@code collection} to be stored efficiently in {@code View}.
 * <p>
 * This restructures the parallel collection so that all of the entities that
 * will be stored in a given partition will be evenly distributed across a
 * specified {@code numParreplacedionWriters}.
 * <p>
 * If the dataset is not partitioned, then this will structure all of the
 * entities to produce a number of files equal to {@code numWriters}.
 *
 * @param collection a collection of entities
 * @param view a {@link View} of a dataset to partition the collection for
 * @param numWriters the number of writers that should be used
 * @param numParreplacedionWriters the number of writers data for a single partition will be distributed across
 * @param <E> the type of entities in the collection and underlying dataset
 * @return an equivalent collection of entities partitioned for the view
 * @see #parreplacedion(PCollection, View)
 *
 * @since 1.1.0
 */
public static <E> PCollection<E> parreplacedion(PCollection<E> collection, View<E> view, int numWriters, int numParreplacedionWriters) {
    // ensure the number of writers is honored whether it is per partition or total.
    DatasetDescriptor descriptor = view.getDataset().getDescriptor();
    if (!descriptor.isParreplacedioned()) {
        return parreplacedion(collection, numWriters);
    }

    GetStorageKey<E> storageKeyFn = new GetStorageKey<E>(view, numParreplacedionWriters);
    PTable<Pair<GenericData.Record, Integer>, E> keyed = collection.by(storageKeyFn, Avros.pairs(Avros.generics(storageKeyFn.schema()), Avros.ints()));

    // A non-positive numWriters falls back to groupByKey() without a partition count.
    PGroupedTable<Pair<GenericData.Record, Integer>, E> grouped;
    if (numWriters > 0) {
        grouped = keyed.groupByKey(numWriters);
    } else {
        grouped = keyed.groupByKey();
    }
    return grouped.ungroup().values();
}

19 View Complete Implementation : Sample.java
Copyright Apache License 2.0
Author : apache
/**
 * The weighted reservoir sampling function with the seed term exposed for testing purposes.
 *
 * <p>All observations are placed under the single key {@code 0}, sampled with
 * {@code groupedWeightedReservoirSample}, and the key is stripped from the result.
 *
 * @param input the weighted observations
 * @param sampleSize The number of elements to select
 * @param seed The test seed
 * @return A random sample of the given size that respects the weighting values
 */
public static <T, N extends Number> PCollection<T> weightedReservoirSample(PCollection<Pair<T, N>> input, int sampleSize, Long seed) {
    PTypeFamily ptf = input.getTypeFamily();
    // The input PType describes Pair<T, N>, so its first sub-type is taken to be the
    // PType of T; getSubTypes() is untyped, hence the narrowly scoped suppression.
    @SuppressWarnings("unchecked")
    PType<T> sampledType = (PType<T>) input.getPType().getSubTypes().get(0);
    PTable<Integer, Pair<T, N>> groupedIn = input.parallelDo(new MapFn<Pair<T, N>, Pair<Integer, Pair<T, N>>>() {

        @Override
        public Pair<Integer, Pair<T, N>> map(Pair<T, N> p) {
            return Pair.of(0, p);
        }
    }, ptf.tableOf(ptf.ints(), input.getPType()));
    int[] ss = { sampleSize };
    return groupedWeightedReservoirSample(groupedIn, ss, seed).parallelDo("Extract sampled value from pair", new MapFn<Pair<Integer, T>, T>() {

        @Override
        public T map(Pair<Integer, T> p) {
            return p.second();
        }
    }, sampledType);
}

19 View Complete Implementation : PercentilesTest.java
Copyright Apache License 2.0
Author : spotify
/**
 * Checks the 0, 0.5 and 1.0 percentiles of the five values 1..5 for both the
 * distributed and the in-memory implementation.
 */
@Test
public void testPercentilesExact() {
    PTable<String, Integer> testTable = MemPipeline.typedTableOf(tableOf(strings(), ints()), "a", 5, "a", 2, "a", 3, "a", 4, "a", 1);
    Map<String, Result<Integer>> actualS = Percentiles.distributed(testTable, 0, 0.5, 1.0).materializeToMap();
    Map<String, Result<Integer>> actualM = Percentiles.inMemory(testTable, 0, 0.5, 1.0).materializeToMap();
    Map<String, Result<Integer>> expected = ImmutableMap.of("a", result(5, Pair.of(0.0, 1), Pair.of(0.5, 3), Pair.of(1.0, 5)));
    assertEquals(expected, actualS);
    assertEquals(expected, actualM);
}

19 View Complete Implementation : AggregatorsIT.java
Copyright Apache License 2.0
Author : apache
/**
 * Splits the lines of {@code ints.txt} into (key, (int, int)) rows, sums both
 * pair components per key with {@code pairAggregator}, and checks the totals
 * for keys "a" and "b".
 */
@Test
public void testPairAggregator() {
    PCollection<String> lines = pipeline.readTextFile(Tests.pathTo(this, "ints.txt"));
    PTable<String, Pair<Integer, Integer>> table = lines.parallelDo(new SplitLine(), tableOf(strings(), pairs(ints(), ints())));
    PTable<String, Pair<Integer, Integer>> combinedTable = table.groupByKey().combineValues(pairAggregator(SUM_INTS(), SUM_INTS()));
    Map<String, Pair<Integer, Integer>> result = combinedTable.asMap().getValue();
    assertThat(result.size(), is(2));
    assertThat(result.get("a"), is(Pair.of(9, 12)));
    assertThat(result.get("b"), is(Pair.of(11, 13)));
}

19 View Complete Implementation : TopList.java
Copyright Apache License 2.0
Author : apache
/**
 * Builds a per-key top-list from a table of (X, Y) rows: for every distinct X
 * the {@code n} most frequent Y values are returned together with how often
 * each occurred. Example: for input = Table(Country, Track) the result holds
 * the {@code n} most common tracks of each country.
 *
 * @param input table of X Y pairs
 * @param n How many Y values to include in the toplist per X (this will be in memory, so don't make this ridiculous)
 * @param <X> group type
 * @param <Y> value type
 * @return table of each unique X value mapped to a collection of (count, Y) pairs
 */
public static <X, Y> PTable<X, Collection<Pair<Long, Y>>> topNYbyX(PTable<X, Y> input, final int n) {
    final PType<X> keyType = input.getKeyType();
    final PType<Y> valueType = input.getValueType();
    PTypeFamily family = keyType.getFamily();

    // Re-keys each ((x, y), count) row by x alone and negates the count.
    // NOTE(review): the negation presumably makes the secondary sort put the largest counts first - confirm.
    MapFn<Pair<Pair<X, Y>, Long>, Pair<X, Pair<Long, Y>>> negateCount = new MapFn<Pair<Pair<X, Y>, Long>, Pair<X, Pair<Long, Y>>>() {

        @Override
        public Pair<X, Pair<Long, Y>> map(Pair<Pair<X, Y>, Long> row) {
            Pair<X, Y> xy = row.first();
            return Pair.of(xy.first(), Pair.of(-row.second(), xy.second()));
        }
    };
    PTable<X, Pair<Long, Y>> counted = input.count().parallelDo(negateCount, family.tableOf(keyType, family.pairs(family.longs(), valueType)));

    // Takes the first n sorted entries of every group and flips the counts back to positive.
    MapFn<Pair<X, Iterable<Pair<Long, Y>>>, Pair<X, Collection<Pair<Long, Y>>>> takeFirstN = new MapFn<Pair<X, Iterable<Pair<Long, Y>>>, Pair<X, Collection<Pair<Long, Y>>>>() {

        private PTableType<Long, Y> tableType;

        @Override
        public void initialize() {
            PTypeFamily valueFamily = valueType.getFamily();
            tableType = valueFamily.tableOf(valueFamily.longs(), valueType);
            tableType.initialize(getConfiguration());
        }

        @Override
        public Pair<X, Collection<Pair<Long, Y>>> map(Pair<X, Iterable<Pair<Long, Y>>> group) {
            Collection<Pair<Long, Y>> top = Lists.newArrayList();
            Iterator<Pair<Long, Y>> remaining = group.second().iterator();
            for (int taken = 0; taken < n && remaining.hasNext(); taken++) {
                Pair<Long, Y> detached = PTables.getDetachedValue(tableType, remaining.next());
                top.add(Pair.of(-detached.first(), detached.second()));
            }
            return Pair.of(group.first(), top);
        }
    };
    return SecondarySort.sortAndApply(counted, takeFirstN, family.tableOf(keyType, family.collections(family.pairs(family.longs(), valueType))));
}

19 View Complete Implementation : MapreduceIT.java
Copyright Apache License 2.0
Author : apache
/**
 * Keys the lines of {@code shakes.txt} by their length, runs
 * {@code TestReducer} over the grouped table via {@code Mapreduce.reduce}, and
 * checks the number of stages and the value of the ("words", "where") counter.
 */
@Test
public void testReducer() throws Exception {
    // Built with this test's own class, matching testMapper in the same file.
    Pipeline p = new MRPipeline(MapreduceIT.class, tempDir.getDefaultConfiguration());
    Path shakesPath = tempDir.copyResourcePath("shakes.txt");
    PCollection<String> in = p.read(From.textFile(shakesPath));
    PTable<IntWritable, Text> two = in.parallelDo(new MapFn<String, Pair<IntWritable, Text>>() {

        @Override
        public Pair<IntWritable, Text> map(String input) {
            return Pair.of(new IntWritable(input.length()), new Text(input));
        }
    }, Writables.tableOf(Writables.writables(IntWritable.class), Writables.writables(Text.class)));
    PTable<Text, LongWritable> out = Mapreduce.reduce(two.groupByKey(), TestReducer.class, Text.class, LongWritable.class);
    out.write(To.sequenceFile(tempDir.getPath("temp")));
    PipelineResult res = p.done();
    assertEquals(1, res.getStageResults().size());
    StageResult sr = res.getStageResults().get(0);
    assertEquals(19, sr.getCounters().findCounter("words", "where").getValue());
}

19 View Complete Implementation : CartesianTest.java
Copyright Apache License 2.0
Author : apache
/**
 * Crosses a two-row left table with a two-row right table and expects all
 * four key/value combinations; the actual rows are sorted before comparison.
 */
@Test
public void testCartesianCollection_Tables() {
    PTable<String, Integer> leftTable = MemPipeline.typedTableOf(Writables.tableOf(Writables.strings(), Writables.ints()), "a", 1, "b", 2);
    PTable<String, Float> rightTable = MemPipeline.typedTableOf(Writables.tableOf(Writables.strings(), Writables.floats()), "A", 1.0f, "B", 2.0f);
    PTable<Pair<String, String>, Pair<Integer, Float>> cartesianProduct = Cartesian.cross(leftTable, rightTable);
    List<Pair<Pair<String, String>, Pair<Integer, Float>>> expectedResults = Lists.newArrayList();
    expectedResults.add(Pair.of(Pair.of("a", "A"), Pair.of(1, 1.0f)));
    expectedResults.add(Pair.of(Pair.of("a", "B"), Pair.of(1, 2.0f)));
    expectedResults.add(Pair.of(Pair.of("b", "A"), Pair.of(2, 1.0f)));
    expectedResults.add(Pair.of(Pair.of("b", "B"), Pair.of(2, 2.0f)));
    List<Pair<Pair<String, String>, Pair<Integer, Float>>> actualResults = Lists.newArrayList(cartesianProduct.materialize());
    Collections.sort(actualResults);
    assertEquals(expectedResults, actualResults);
}

19 View Complete Implementation : CrunchETL.java
Copyright Apache License 2.0
Author : apache
/**
 * Applies {@code COUNT_BY_PRODUCT} to the line items, counts the occurrences
 * of each resulting string, and materializes the counts as a map.
 *
 * @return a map from each string emitted by {@code COUNT_BY_PRODUCT} to its count
 */
@Override
public Map<String, ? extends Number> numberOfProductsByProduct() throws Exception {
    PTable<String, Long> counts = lineItems.parallelDo(COUNT_BY_PRODUCT, Avros.strings()).count();
    // Typed result instead of a raw Map; Map<String, Long> satisfies the declared return type.
    Map<String, Long> countsByProduct = counts.materializeToMap();
    return countsByProduct;
}

19 View Complete Implementation : OneToManyJoinTest.java
Copyright Apache License 2.0
Author : apache
/**
 * Joins a left table that only has key 2 against a right table with keys 1
 * and 2 and expects a single joined row for key 2.
 */
@Test
public void testOneToMany_UnmatchedLeftSide() {
    PTable<Integer, String> left = MemPipeline.typedTableOf(Avros.tableOf(Avros.ints(), Avros.strings()), 2, "two");
    PTable<Integer, String> right = MemPipeline.typedTableOf(Avros.tableOf(Avros.ints(), Avros.strings()), 1, "1A", 1, "1B", 2, "2A", 2, "2B");
    PCollection<Pair<String, String>> joined = OneToManyJoin.oneToManyJoin(left, right, new StringJoinFn(), Avros.pairs(Avros.strings(), Avros.strings()));
    List<Pair<String, String>> expected = ImmutableList.of(Pair.of("two", "2A,2B"));
    assertEquals(expected, Lists.newArrayList(joined.materialize()));
}

19 View Complete Implementation : SecondarySortIT.java
Copyright Apache License 2.0
Author : apache
/**
 * Parses {@code secondary_sort_input.txt} into (name, (int, int)) rows, applies
 * {@code SecondarySort.sortAndApply} to join each group into one line, and
 * compares the materialized lines with the expected output.
 *
 * @param ptf the type family used for the table and result types
 */
public void runSecondarySort(PTypeFamily ptf) throws Exception {
    Pipeline p = new MRPipeline(SecondarySortIT.class, tempDir.getDefaultConfiguration());
    String inputFile = tempDir.copyResourceFileName("secondary_sort_input.txt");
    PTable<String, Pair<Integer, Integer>> in = p.read(From.textFile(inputFile)).parallelDo(new MapFn<String, Pair<String, Pair<Integer, Integer>>>() {

        // Each line is split on commas into a name followed by two integers.
        @Override
        public Pair<String, Pair<Integer, Integer>> map(String input) {
            String[] pieces = input.split(",");
            return Pair.of(pieces[0], Pair.of(Integer.valueOf(pieces[1].trim()), Integer.valueOf(pieces[2].trim())));
        }
    }, ptf.tableOf(ptf.strings(), ptf.pairs(ptf.ints(), ptf.ints())));
    Iterable<String> lines = SecondarySort.sortAndApply(in, new MapFn<Pair<String, Iterable<Pair<Integer, Integer>>>, String>() {

        @Override
        public String map(Pair<String, Iterable<Pair<Integer, Integer>>> input) {
            Joiner j = Joiner.on(',');
            return j.join(input.first(), j.join(input.second()));
        }
    }, ptf.strings()).materialize();
    assertEquals(ImmutableList.of("one,[-5,10],[1,1],[2,-3]", "three,[0,-1]", "two,[1,7],[2,6],[4,5]"), ImmutableList.copyOf(lines));
    p.done();
}

19 View Complete Implementation : MapreduceTest.java
Copyright Apache License 2.0
Author : apache
/**
 * Converts an in-memory (Integer, String) table to (IntWritable, Text), runs
 * {@code TestMapper} over it via {@code Mapreduce.map}, and compares the
 * materialized output with the expected rows.
 */
@Test
public void testMapper() throws Exception {
    PTable<Integer, String> in = MemPipeline.typedTableOf(Avros.tableOf(Avros.ints(), Avros.strings()), 1, "foot", 2, "ball", 3, "bazzar");
    PTable<IntWritable, Text> two = in.parallelDo(new MapFn<Pair<Integer, String>, Pair<IntWritable, Text>>() {

        @Override
        public Pair<IntWritable, Text> map(Pair<Integer, String> input) {
            return Pair.of(new IntWritable(input.first()), new Text(input.second()));
        }
    }, Writables.tableOf(Writables.writables(IntWritable.class), Writables.writables(Text.class)));
    PTable<IntWritable, Text> out = Mapreduce.map(two, TestMapper.class, IntWritable.class, Text.class);
    assertEquals(ImmutableList.of($2(4, "foot"), $2(4, "ball"), $2(6, "bazzar")), Lists.newArrayList(out.materialize()));
}

19 View Complete Implementation : OneToManyJoinTest.java
Copyright Apache License 2.0
Author : apache
/**
 * Joins a left table with one value per key against a right table with two
 * values per key and expects one joined row per key.
 */
@Test
public void testOneToMany() {
    PTable<Integer, String> left = MemPipeline.typedTableOf(Avros.tableOf(Avros.ints(), Avros.strings()), 1, "one", 2, "two");
    PTable<Integer, String> right = MemPipeline.typedTableOf(Avros.tableOf(Avros.ints(), Avros.strings()), 1, "1A", 1, "1B", 2, "2A", 2, "2B");
    PCollection<Pair<String, String>> joined = OneToManyJoin.oneToManyJoin(left, right, new StringJoinFn(), Avros.pairs(Avros.strings(), Avros.strings()));
    List<Pair<String, String>> expected = ImmutableList.of(Pair.of("one", "1A,1B"), Pair.of("two", "2A,2B"));
    assertEquals(expected, Lists.newArrayList(joined.materialize()));
}

19 View Complete Implementation : PTableBase.java
Copyright Apache License 2.0
Author : apache
/**
 * Creates the union of this table with {@code others} through the pipeline's
 * factory. This table comes first in the list handed to the factory, followed
 * by {@code others} in argument order; each of them is cast to
 * {@code PTableBase}.
 *
 * @param others the tables to combine with this one
 * @return the union table created by the pipeline's factory
 */
@Override
public PTable<K, V> union(PTable<K, V>... others) {
    List<PTableBase<K, V>> members = Lists.newArrayList();
    members.add(this);
    for (int i = 0; i < others.length; i++) {
        members.add((PTableBase<K, V>) others[i]);
    }
    return pipeline.getFactory().createUnionTable(members);
}

19 View Complete Implementation : QuantilesTest.java
Copyright Apache License 2.0
Author : apache
/**
 * Checks the 0, 0.5 and 1.0 quantiles of the five values 1..5 for both the
 * distributed and the in-memory implementation.
 */
@Test
public void testQuantilesExact() {
    PTable<String, Integer> testTable = MemPipeline.typedTableOf(tableOf(strings(), ints()), "a", 5, "a", 2, "a", 3, "a", 4, "a", 1);
    Map<String, Result<Integer>> actualS = Quantiles.distributed(testTable, 0, 0.5, 1.0).materializeToMap();
    Map<String, Result<Integer>> actualM = Quantiles.inMemory(testTable, 0, 0.5, 1.0).materializeToMap();
    Map<String, Result<Integer>> expected = ImmutableMap.of("a", result(5, Pair.of(0.0, 1), Pair.of(0.5, 3), Pair.of(1.0, 5)));
    assertEquals(expected, actualS);
    assertEquals(expected, actualM);
}

19 View Complete Implementation : JoinTester.java
Copyright Apache License 2.0
Author : apache
/**
 * Reads {@code shakes.txt} and {@code dickens.txt}, joins them with
 * {@code join(...)}, hands the materialized rows to {@code assertPassed}, and
 * finishes the pipeline.
 *
 * @param pipeline the pipeline used to read the inputs and run the join
 * @param typeFamily the type family passed on to {@code join(...)}
 */
protected void run(Pipeline pipeline, PTypeFamily typeFamily) throws IOException {
    String shakesInputPath = tmpDir.copyResourceFileName("shakes.txt");
    String dickensInputPath = tmpDir.copyResourceFileName("dickens.txt");
    PCollection<String> shakespeare = pipeline.readTextFile(shakesInputPath);
    PCollection<String> dickens = pipeline.readTextFile(dickensInputPath);
    PTable<String, Long> joined = join(shakespeare, dickens, typeFamily);
    Iterable<Pair<String, Long>> lines = joined.materialize();
    assertPassed(lines);
    pipeline.done();
}

19 View Complete Implementation : QuantilesTest.java
Copyright Apache License 2.0
Author : apache
/**
 * Checks the 0.9 and 0.99 quantiles of the ten values 10..100 for both the
 * distributed and the in-memory implementation.
 */
@Test
public void testQuantilesNines() {
    PTable<String, Integer> testTable = MemPipeline.typedTableOf(tableOf(strings(), ints()), "a", 10, "a", 20, "a", 30, "a", 40, "a", 50, "a", 60, "a", 70, "a", 80, "a", 90, "a", 100);
    Map<String, Result<Integer>> actualS = Quantiles.distributed(testTable, 0.9, 0.99).materializeToMap();
    Map<String, Result<Integer>> actualM = Quantiles.inMemory(testTable, 0.9, 0.99).materializeToMap();
    Map<String, Result<Integer>> expected = ImmutableMap.of("a", result(10, Pair.of(0.9, 90), Pair.of(0.99, 100)));
    assertEquals(expected, actualS);
    assertEquals(expected, actualM);
}

19 View Complete Implementation : Average.java
Copyright Apache License 2.0
Author : apache
/**
 * Computes the arithmetic mean of the values of each key in a table with
 * numeric values.
 *
 * <p>Every value is turned into a (value as double, 1) pair; the pairs are
 * summed component-wise per key and the summed value is divided by the
 * summed count.
 *
 * @param table PTable of (key, value) pairs to operate on
 * @param <K> Key type, can be any type
 * @param <V> Value type, must be numeric (ie. extend java.lang.Number)
 * @return {@code PTable<K, Double>} of (key, mean(values)) pairs
 */
public static <K, V extends Number> PTable<K, Double> meanValue(PTable<K, V> table) {
    PTypeFamily family = table.getTypeFamily();

    MapFn<V, Pair<Double, Long>> toValueAndOne = new MapFn<V, Pair<Double, Long>>() {

        @Override
        public Pair<Double, Long> map(V value) {
            return Pair.of(value.doubleValue(), 1L);
        }
    };
    MapFn<Pair<Double, Long>, Double> sumOverCount = new MapFn<Pair<Double, Long>, Double>() {

        @Override
        public Double map(Pair<Double, Long> sumAndCount) {
            double sum = sumAndCount.first();
            long count = sumAndCount.second();
            return sum / count;
        }
    };

    PTable<K, Pair<Double, Long>> valueAndOne = table.mapValues(toValueAndOne, family.pairs(family.doubles(), family.longs()));
    PGroupedTable<K, Pair<Double, Long>> byKey = valueAndOne.groupByKey();
    return byKey.combineValues(pairAggregator(SUM_DOUBLES(), SUM_LONGS())).mapValues(sumOverCount, family.doubles());
}

19 View Complete Implementation : SecondarySortTest.java
Copyright Apache License 2.0
Author : apache
/**
 * Runs {@code SecondarySort.sortAndApply} with {@code StringifyFn} on an
 * in-memory table with keys 1729 (two values) and 100 (one value) and checks
 * the resulting strings.
 */
@Test
public void testInMemory() throws Exception {
    PTable<Long, Pair<Long, String>> input = MemPipeline.typedTableOf(tableOf(longs(), pairs(longs(), strings())), 1729L, Pair.of(17L, "a"), 100L, Pair.of(29L, "b"), 1729L, Pair.of(29L, "c"));
    PCollection<String> letters = SecondarySort.sortAndApply(input, new StringifyFn(), strings());
    assertEquals(ImmutableList.of("b", "ac"), letters.materialize());
}

19 View Complete Implementation : BloomFilterFactory.java
Copyright Apache License 2.0
Author : apache
/**
 * Builds one Bloom filter table per entry listed under {@code inputPath} and unions them into
 * a single table, returned as a {@link PObject} wrapping a {@link Map}. Per the original
 * contract, the map has file names as keys and filters as values.
 *
 * @param inputPath directory whose entries are read as text files
 * @param filterFn function applied to each file's lines to produce the filter entries
 * @return a {@link PObject} holding the map of filters
 * @throws IOException if the file system cannot be accessed, or if nothing is listed under
 *         {@code inputPath} (previously this surfaced as a {@code NullPointerException})
 */
public static PObject<Map<String, BloomFilter>> createFilter(Path inputPath, BloomFilterFn<String> filterFn) throws IOException {
    MRPipeline pipeline = new MRPipeline(BloomFilterFactory.class);
    FileStatus[] listStatus = FileSystem.get(pipeline.getConfiguration()).listStatus(inputPath);
    // Without at least one entry there is no table to union, so fail with a clear message.
    if (listStatus == null || listStatus.length == 0) {
        throw new IOException("No input files found under " + inputPath);
    }
    PTable<String, BloomFilter> filterTable = null;
    for (FileStatus fileStatus : listStatus) {
        Path path = fileStatus.getPath();
        PCollection<String> readTextFile = pipeline.readTextFile(path.toString());
        // Record the current file name in the configuration under the filter-name key.
        pipeline.getConfiguration().set(BloomFilterFn.CRUNCH_FILTER_NAME, path.getName());
        PTable<String, BloomFilter> currentTable = createFilterTable(readTextFile, filterFn);
        // The first table seeds the result; every later one is unioned onto it.
        filterTable = (filterTable == null) ? currentTable : filterTable.union(currentTable);
    }
    return filterTable.asMap();
}

19 View Complete Implementation : CrunchDatasets.java
Copyright Apache License 2.0
Author : kite-sdk
/**
 * Sends every element of {@code collection} through a group-by-key step, using the element
 * itself as the key, and returns the keys of the ungrouped result.
 *
 * NOTE(review): the method name looks like a garbled "partition" -- confirm against callers
 * before renaming.
 *
 * @param collection elements to shuffle
 * @param numReducers reducer count passed to {@code groupByKey}; when not positive, the
 *        no-argument {@code groupByKey()} is used instead
 * @return the elements after the group/ungroup round trip
 */
private static <E> PCollection<E> parreplacedion(PCollection<E> collection, int numReducers) {
    PTableType<E, Void> keyOnlyType = Avros.tableOf(collection.getPType(), Avros.nulls());
    PTable<E, Void> keyed = collection.parallelDo(new AsKeyTable<E>(), keyOnlyType);

    PGroupedTable<E, Void> shuffled;
    if (numReducers > 0) {
        shuffled = keyed.groupByKey(numReducers);
    } else {
        shuffled = keyed.groupByKey();
    }
    return shuffled.ungroup().keys();
}

19 View Complete Implementation : Cartesian.java
Copyright Apache License 2.0
Author : apache
/**
 * Performs a full cross join on the specified {@link PCollection}s (using the
 * same strategy as Pig's CROSS operator).
 *
 * @see <a href="http://en.wikipedia.org/wiki/Join_(SQL)#Cross_join">Cross
 *      Join</a>
 * @param left
 *          A PCollection to perform a cross join on.
 * @param right
 *          A PCollection to perform a cross join on.
 * @param parallelism
 *          Value handed to both {@code GFCross} functions; presumably controls how
 *          many buckets the join is spread over -- TODO confirm against GFCross.
 * @param <U>
 *          Type of the first {@link PCollection}'s values
 * @param <V>
 *          Type of the second {@link PCollection}'s values
 * @return The joined result as tuples of (U,V).
 * @throws IllegalStateException
 *           if the left type family yields no table type
 */
public static <U, V> PCollection<Pair<U, V>> cross(PCollection<U> left, PCollection<V> right, int parallelism) {
    PTypeFamily ltf = left.getTypeFamily();
    PTypeFamily rtf = right.getTypeFamily();
    PTableType<Pair<Integer, Integer>, U> leftType = ltf.tableOf(ltf.pairs(ltf.ints(), ltf.ints()), left.getPType());
    if (leftType == null) {
        throw new IllegalStateException("Type family returned no table type for the left collection");
    }
    PTableType<Pair<Integer, Integer>, V> rightType = rtf.tableOf(rtf.pairs(rtf.ints(), rtf.ints()), right.getPType());
    // Each side is keyed by a pair of ints produced by GFCross; the two sides are then joined on it.
    PTable<Pair<Integer, Integer>, U> leftCross = left.parallelDo(new GFCross<U>(0, parallelism), leftType);
    PTable<Pair<Integer, Integer>, V> rightCross = right.parallelDo(new GFCross<V>(1, parallelism), rightType);
    PTable<Pair<Integer, Integer>, Pair<U, V>> cg = leftCross.join(rightCross);
    PTypeFamily ctf = cg.getTypeFamily();
    // Drop the synthetic join key and keep only the (U, V) payload.
    return cg.parallelDo("Extract second element", new MapFn<Pair<Pair<Integer, Integer>, Pair<U, V>>, Pair<U, V>>() {

        @Override
        public Pair<U, V> map(Pair<Pair<Integer, Integer>, Pair<U, V>> input) {
            return input.second();
        }
    }, ctf.pairs(left.getPType(), right.getPType()));
}

19 View Complete Implementation : AveragesTest.java
Copyright Apache License 2.0
Author : spotify
/**
 * Checks that {@code Averages.meanValue} yields the per-key mean for three keys holding two,
 * one and three integer values respectively.
 */
@Test
public void testMeanValue() {
    PTable<String, Integer> testTable = MemPipeline.typedTableOf(tableOf(strings(), ints()), "a", 2, "a", 10, "b", 3, "c", 3, "c", 4, "c", 5);
    Map<String, Double> actual = Averages.meanValue(testTable).materializeToMap();
    Map<String, Double> expected = ImmutableMap.of("a", 6.0, "b", 3.0, "c", 4.0);
    assertEquals(expected, actual);
}

19 View Complete Implementation : MapredIT.java
Copyright Apache License 2.0
Author : apache
/**
 * Reads {@code shakes.txt}, keys each line by its length, runs the result through
 * {@code Mapred.map} with {@code TestMapper}, writes a sequence file, and then checks that the
 * pipeline ran as a single stage whose "written"/"out" counter equals 3285.
 */
@Test
public void testMapper() throws Exception {
    Pipeline p = new MRPipeline(MapredIT.class, tempDir.getDefaultConfiguration());
    Path shakesPath = tempDir.copyResourcePath("shakes.txt");
    PCollection<String> in = p.read(From.textFile(shakesPath));
    // Convert each line into an (IntWritable length, Text line) pair.
    PTable<IntWritable, Text> two = in.parallelDo(new MapFn<String, Pair<IntWritable, Text>>() {

        @Override
        public Pair<IntWritable, Text> map(String input) {
            return Pair.of(new IntWritable(input.length()), new Text(input));
        }
    }, Writables.tableOf(Writables.writables(IntWritable.class), Writables.writables(Text.class)));
    PTable<Text, LongWritable> out = Mapred.map(two, TestMapper.class, Text.class, LongWritable.class);
    out.write(To.sequenceFile(tempDir.getPath("temp")));
    PipelineResult res = p.done();
    assertEquals(1, res.getStageResults().size());
    StageResult sr = res.getStageResults().get(0);
    assertEquals(3285, sr.getCounters().findCounter("written", "out").getValue());
}

19 View Complete Implementation : PercentilesTest.java
Copyright Apache License 2.0
Author : spotify
/**
 * Verifies that the 0.9 and 0.99 percentiles of ten values (10..100) stored under a single key
 * are reported identically by the distributed and the in-memory implementations.
 */
@Test
public void testPercentilesNines() {
    PTable<String, Integer> testTable = MemPipeline.typedTableOf(tableOf(strings(), ints()), "a", 10, "a", 20, "a", 30, "a", 40, "a", 50, "a", 60, "a", 70, "a", 80, "a", 90, "a", 100);
    Map<String, Result<Integer>> actualS = Percentiles.distributed(testTable, 0.9, 0.99).materializeToMap();
    Map<String, Result<Integer>> actualM = Percentiles.inMemory(testTable, 0.9, 0.99).materializeToMap();
    Map<String, Result<Integer>> expected = ImmutableMap.of("a", result(10, Pair.of(0.9, 90), Pair.of(0.99, 100)));
    // Both code paths must agree with the same expected result.
    assertEquals(expected, actualS);
    assertEquals(expected, actualM);
}

19 View Complete Implementation : Sort.java
Copyright Apache License 2.0
Author : apache
/**
 * Sorts a {@code PCollection} of tuples by the given column ordering, using a
 * caller-chosen number of reducers.
 *
 * @param collection the tuples to sort
 * @param numReducers reducer count forwarded to {@code buildGroupingOptions}
 * @param columnOrders the columns, and their directions, to order by
 * @return a {@code PCollection} representing the sorted collection.
 */
public static <T extends Tuple> PCollection<T> sortTuples(PCollection<T> collection, int numReducers, ColumnOrder... columnOrders) {
    // Derive the sort key for each tuple from the requested columns.
    SortFns.KeyExtraction<T> extraction = new SortFns.KeyExtraction<T>(collection.getPType(), columnOrders);
    PTable<Object, T> keyed = collection.by(extraction.getByFn(), extraction.getKeyType());

    GroupingOptions sortOptions = buildGroupingOptions(keyed, collection.getPipeline().getConfiguration(), numReducers, columnOrders);

    // Group with the sort options, then discard the keys again.
    return keyed.groupByKey(sortOptions).ungroup().values();
}

19 View Complete Implementation : PercentilesTest.java
Copyright Apache License 2.0
Author : spotify
/**
 * Verifies that the 0.5 percentile of ten values (10..100) stored under a single key is
 * reported as 50 by both the distributed and the in-memory implementations.
 */
@Test
public void testPercentilesLessThanOrEqual() {
    PTable<String, Integer> testTable = MemPipeline.typedTableOf(tableOf(strings(), ints()), "a", 10, "a", 20, "a", 30, "a", 40, "a", 50, "a", 60, "a", 70, "a", 80, "a", 90, "a", 100);
    Map<String, Result<Integer>> actualS = Percentiles.distributed(testTable, 0.5).materializeToMap();
    Map<String, Result<Integer>> actualM = Percentiles.inMemory(testTable, 0.5).materializeToMap();
    Map<String, Result<Integer>> expected = ImmutableMap.of("a", result(10, Pair.of(0.5, 50)));
    // Both code paths must agree with the same expected result.
    assertEquals(expected, actualS);
    assertEquals(expected, actualM);
}

19 View Complete Implementation : CrunchETL.java
Copyright Apache License 2.0
Author : apache
/**
 * Counts the strings that {@code COUNT_BY_STATE} emits for the line items, prints the
 * resulting map to standard output, and returns it.
 *
 * @return a map from each emitted string to the number of times it occurred
 * @throws Exception declared by the overridden method
 */
@Override
public Map<String, ? extends Number> numberOfTransactionsByState() throws Exception {
    PTable<String, Long> counts = lineItems.parallelDo(COUNT_BY_STATE, Avros.strings()).count();
    // Typed map instead of a raw Map, which avoids an unchecked conversion on return.
    Map<String, Long> countsByState = counts.materializeToMap();
    System.out.println("Crunch:::  " + countsByState);
    return countsByState;
}

19 View Complete Implementation : TopListsTest.java
Copyright Apache License 2.0
Author : spotify
/**
 * Checks that {@code TopLists.topNYbyX} with N = 2 returns, for each key, the two most
 * frequent values together with their occurrence counts.
 */
@Test
public void testTopNYbyX() {
    PTable<String, String> data = MemPipeline.typedTableOf(tableOf(strings(), strings()), "a", "x", "a", "x", "a", "x", "a", "y", "a", "y", "a", "z", "b", "x", "b", "x", "b", "z");
    Map<String, Collection<Pair<Long, String>>> actual = TopLists.topNYbyX(data, 2).materializeToMap();
    Map<String, Collection<Pair<Long, String>>> expected = ImmutableMap.of("a", collectionOf(Pair.of(3L, "x"), Pair.of(2L, "y")), "b", collectionOf(Pair.of(2L, "x"), Pair.of(1L, "z")));
    assertEquals(expected, actual);
}

19 View Complete Implementation : AvroModeIT.java
Copyright Apache License 2.0
Author : apache
/**
 * Reads generic Avro records, maps each one to a (text length, two random floats) pair,
 * sums the float arrays element-wise per key with a custom aggregator, writes the result as
 * text, and asserts that the pipeline succeeds.
 */
@Test
public void testGenericReflectConflict() throws IOException {
    // Fixed seed so the generated floats are the same on every run.
    final Random rand = new Random();
    rand.setSeed(12345);
    Configuration conf = new Configuration();
    Pipeline pipeline = new MRPipeline(AvroModeIT.class, conf);
    Source<GenericData.Record> source = From.avroFile(tmpDir.copyResourceFileName("strings-100.avro"), Avros.generics(GENERIC_SCHEMA));
    PTable<Long, float[]> mapPhase = pipeline.read(source).parallelDo(new DoFn<GenericData.Record, Pair<Long, float[]>>() {

        @Override
        public void process(GenericData.Record input, Emitter<Pair<Long, float[]>> emitter) {
            emitter.emit(Pair.of(Long.valueOf(input.get("text").toString().length()), new float[] { rand.nextFloat(), rand.nextFloat() }));
        }
    }, Avros.tableOf(Avros.longs(), FLOAT_ARRAY));
    PTable<Long, float[]> result = mapPhase.groupByKey().combineValues(new Aggregator<float[]>() {

        // Running element-wise sum; null until the first value after a reset.
        float[] accumulator = null;

        @Override
        public Iterable<float[]> results() {
            return ImmutableList.of(accumulator);
        }

        @Override
        public void initialize(Configuration conf) {
        }

        @Override
        public void reset() {
            this.accumulator = null;
        }

        @Override
        public void update(float[] value) {
            if (accumulator == null) {
                // Copy the whole array so the loop below, which runs to value.length, stays in bounds.
                accumulator = Arrays.copyOf(value, value.length);
            } else {
                for (int i = 0; i < value.length; i += 1) {
                    accumulator[i] += value[i];
                }
            }
        }
    });
    pipeline.writeTextFile(result, tmpDir.getFileName("unused"));
    Assert.assertTrue("Should succeed", pipeline.done().succeeded());
}

19 View Complete Implementation : BloomFilterFactory.java
Copyright Apache License 2.0
Author : apache
/**
 * Applies {@code filterFn} to {@code collection} to obtain a (String, BloomFilter) table, then
 * groups it by key with a reducer count of 1 and combines each key's filters with a
 * {@code BloomFilterAggregator}.
 *
 * @param collection the elements to feed into the filter function
 * @param filterFn function producing the (name, filter) pairs
 * @param <T> element type of the collection
 * @return the combined table of Bloom filters
 */
private static <T> PTable<String, BloomFilter> createFilterTable(PCollection<T> collection, BloomFilterFn<T> filterFn) {
    PTypeFamily tf = collection.getTypeFamily();
    PTable<String, BloomFilter> table = collection.parallelDo(filterFn, tf.tableOf(tf.strings(), Writables.writables(BloomFilter.class)));
    return table.groupByKey(1).combineValues(new BloomFilterAggregator());
}

19 View Complete Implementation : CSVFileSourceIT.java
Copyright Apache License 2.0
Author : apache
/**
 * Reads {@code vanilla.csv} through a {@code CSVFileSource}, counts the records, takes the
 * keys of the count table, and checks that every expected line is among them.
 */
@Test
public void testVanillaCSVWithAdditionalActions() throws Exception {
    final String[] expectedFileContents = { "1,2,3,4", "5,6,7,8", "9,10,11", "12,13,14" };
    final String vanillaCSVFile = tmpDir.copyResourceFileName("vanilla.csv");
    final Pipeline pipeline = new MRPipeline(CSVFileSourceIT.class, tmpDir.getDefaultConfiguration());
    final PCollection<String> csvLines = pipeline.read(new CSVFileSource(new Path(vanillaCSVFile)));
    final PTable<String, Long> countTable = csvLines.count();
    final PCollection<String> csvLines2 = countTable.keys();
    final Collection<String> csvLinesList = csvLines2.asCollection().getValue();
    for (String expectedLine : expectedFileContents) {
        assertTrue(csvLinesList.contains(expectedLine));
    }
}

19 View Complete Implementation : SortByValueIT.java
Copyright Apache License 2.0
Author : apache
/**
 * Reads {@code sort_by_value.txt} as a tab-split (String, Long) table, sorts the pairs by
 * column 2 descending and then column 1 ascending, and checks the resulting order.
 *
 * @param pipeline pipeline used to read the input
 * @param ptf type family supplying the table, string and long types
 * @throws Exception if reading or materializing the data fails
 */
public void run(Pipeline pipeline, PTypeFamily ptf) throws Exception {
    String sbv = tmpDir.copyResourceFileName("sort_by_value.txt");
    PTable<String, Long> letterCounts = pipeline.read(From.textFile(sbv)).parallelDo(new SplitFn("\t"), ptf.tableOf(ptf.strings(), ptf.longs()));
    PCollection<Pair<String, Long>> sorted = Sort.sortPairs(letterCounts, new ColumnOrder(2, Order.DESCENDING), new ColumnOrder(1, Order.ASCENDING));
    assertEquals(ImmutableList.of(Pair.of("C", 3L), Pair.of("A", 2L), Pair.of("D", 2L), Pair.of("B", 1L), Pair.of("E", 1L)), ImmutableList.copyOf(sorted.materialize()));
}

19 View Complete Implementation : MapredTest.java
Copyright Apache License 2.0
Author : apache
/**
 * Converts an in-memory (Integer, String) table to Writable types, runs it through
 * {@code Mapred.map} with {@code TestMapper}, and checks the materialized output.
 */
@Test
public void testMapper() throws Exception {
    PTable<Integer, String> in = MemPipeline.typedTableOf(Avros.tableOf(Avros.ints(), Avros.strings()), 1, "foot", 2, "ball", 3, "bazzar");
    // Wrap each key and value in its Hadoop Writable counterpart.
    PTable<IntWritable, Text> two = in.parallelDo(new MapFn<Pair<Integer, String>, Pair<IntWritable, Text>>() {

        @Override
        public Pair<IntWritable, Text> map(Pair<Integer, String> input) {
            return Pair.of(new IntWritable(input.first()), new Text(input.second()));
        }
    }, Writables.tableOf(Writables.writables(IntWritable.class), Writables.writables(Text.class)));
    PTable<Text, LongWritable> out = Mapred.map(two, TestMapper.class, Text.class, LongWritable.class);
    assertEquals(ImmutableList.of($("foot", 4), $("ball", 4), $("bazzar", 6)), Lists.newArrayList(out.materialize()));
}

18 View Complete Implementation : FormattedFileIT.java
Copyright Apache License 2.0
Author : apache
/**
 * Reads {@code urls.txt} as a (LongWritable, Text) table via {@code TextInputFormat} and
 * checks that the character at index 4 of every non-empty line matches the expected sequence;
 * empty lines map to the empty string.
 */
@Test
public void testReadFormattedFile() throws Exception {
    String urlsFile = tmpDir.copyResourceFileName("urls.txt");
    Pipeline p = new MRPipeline(FormattedFileIT.class, tmpDir.getDefaultConfiguration());
    PTable<LongWritable, Text> urls = p.read(From.formattedFile(urlsFile, TextInputFormat.class, LongWritable.class, Text.class));
    List<String> expect = ImmutableList.of("A", "A", "A", "B", "B", "C", "D", "E", "F", "F", "");
    List<String> actual = Lists.newArrayList(Iterables.transform(urls.materialize(), new Function<Pair<LongWritable, Text>, String>() {

        @Override
        public String apply(Pair<LongWritable, Text> pair) {
            String str = pair.second().toString();
            // Empty lines have no character at index 4, so pass them through unchanged.
            if (str.isEmpty()) {
                return str;
            }
            return str.substring(4, 5);
        }
    }));
    assertEquals(expect, actual);
    p.done();
}

18 View Complete Implementation : TextFileTableIT.java
Copyright Apache License 2.0
Author : apache
/**
 * Reads {@code urls.txt} as a (String, String) table through a {@code TextFileTableSource},
 * counts the occurrences of each key, and checks the counts against the expected set.
 */
@Test
public void testTextFileTable() throws Exception {
    String urlsFile = tmpDir.copyResourceFileName("urls.txt");
    Pipeline pipeline = new MRPipeline(TextFileTableIT.class, tmpDir.getDefaultConfiguration());
    PTable<String, String> urls = pipeline.read(new TextFileTableSource<String, String>(urlsFile, tableOf(strings(), strings())));
    Set<Pair<String, Long>> cnts = ImmutableSet.copyOf(urls.keys().count().materialize());
    assertEquals(ImmutableSet.of(Pair.of("www.A.com", 4L), Pair.of("www.B.com", 2L), Pair.of("www.C.com", 1L), Pair.of("www.D.com", 1L), Pair.of("www.E.com", 1L), Pair.of("www.F.com", 2L)), cnts);
}