org.apache.hadoop.mapreduce.lib.input.FileSplit - Java examples

Here are examples of the Java API org.apache.hadoop.mapreduce.lib.input.FileSplit, taken from open source projects. By voting an example up you can indicate which examples are most useful and appropriate.

155 Examples
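
Before the project examples, here is a minimal, hypothetical sketch (not taken from any of the projects below) of the FileSplit accessors the examples rely on: a file-based InputSplit is cast to FileSplit to obtain the backing file's path, the byte offset at which the split starts, its length, and the hosts holding the data. The class name FileSplitDescriber and its method are placeholders.

import java.io.IOException;
import java.util.Arrays;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

public final class FileSplitDescriber {

    private FileSplitDescriber() {
    }

    // Summarizes a file-based split; non-file splits are reported as such.
    public static String describe(InputSplit split) throws IOException, InterruptedException {
        if (!(split instanceof FileSplit)) {
            return "not a file-based split: " + split.getClass().getName();
        }
        FileSplit fileSplit = (FileSplit) split;
        // byte offset of the split within the file
        long start = fileSplit.getStart();
        // exclusive end offset
        long end = start + fileSplit.getLength();
        return fileSplit.getPath() + " [" + start + ", " + end + ") on " + Arrays.toString(fileSplit.getLocations());
    }
}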

19 View Complete Implementation : TemporaryInputFormat.java
Copyright Apache License 2.0
Author : asakusafw
@Override
public RecordReader<NullWritable, T> createRecordReader(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
    FileSplit s = (FileSplit) split;
    assert s.getStart() % TemporaryFile.BLOCK_SIZE == 0;
    assert s.getStart() > 0 || s.getLength() > 0;
    return createRecordReader();
}

19 View Complete Implementation : BAMInputFormat.java
Copyright MIT License
Author : HadoopGenomics
// Works the same way as addIndexedSplits, to avoid having to reopen the
// file repeatedly and checking addIndexedSplits for an index repeatedly.
private int addProbabilisticSplits(List<InputSplit> splits, int i, List<InputSplit> newSplits, Configuration cfg) throws IOException {
    final Path path = ((FileSplit) splits.get(i)).getPath();
    try (final SeekableStream sin = WrapSeekable.openPath(path.getFileSystem(cfg), path)) {
        final BAMSplitGuesser guesser = new BAMSplitGuesser(sin, cfg);
        FileVirtualSplit previousSplit = null;
        for (; i < splits.size(); ++i) {
            FileSplit fspl = (FileSplit) splits.get(i);
            if (!fspl.getPath().equals(path))
                break;
            long beg = fspl.getStart();
            long end = beg + fspl.getLength();
            long alignedBeg = guesser.guessNextBAMRecordStart(beg, end);
            // As the guesser goes to the next BGZF block before looking for BAM
            // records, the ending BGZF blocks have to always be traversed fully.
            // Hence force the length to be 0xffff, the maximum possible.
            long alignedEnd = end << 16 | 0xffff;
            if (alignedBeg == end) {
                // No records detected in this split: merge it to the previous one.
                // This could legitimately happen e.g. if we have a split that is
                // so small that it only contains the middle part of a BGZF block.
                // 
                // Of course, if it's the first split, then this is simply not a
                // valid BAM file.
                // 
                // FIXME: In theory, any number of splits could only contain parts
                // of the BAM header before we start to see splits that contain BAM
                // records. For now, we require that the split size is at least as
                // big as the header and don't handle that case.
                if (previousSplit == null)
                    throw new IOException("'" + path + "': " + "no reads in first split: bad BAM file or tiny split size?");
                previousSplit.setEndVirtualOffset(alignedEnd);
            } else {
                previousSplit = new FileVirtualSplit(path, alignedBeg, alignedEnd, fspl.getLocations());
                if (logger.isDebugEnabled()) {
                    final long byteOffset = alignedBeg >>> 16;
                    final long recordOffset = alignedBeg & 0xffff;
                    logger.debug("Split {}: byte offset: {} record offset: {}, virtual offset: {}", i, byteOffset, recordOffset, alignedBeg);
                }
                newSplits.add(previousSplit);
            }
        }
    }
    return i;
}
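
For readers unfamiliar with the bit arithmetic above: a BGZF virtual offset packs the compressed file offset of a BGZF block into the upper 48 bits and the offset within the uncompressed block into the lower 16 bits, which is why the split end is forced to end << 16 | 0xffff, the last possible position inside the final block. A small illustrative sketch of that arithmetic (these helper methods are placeholders, not part of Hadoop-BAM):

// Pack a block file offset and an intra-block offset into a BGZF virtual offset.
static long toVirtualOffset(long blockFileOffset, int withinBlockOffset) {
    return (blockFileOffset << 16) | (withinBlockOffset & 0xffffL);
}

// Upper 48 bits: compressed file offset of the BGZF block.
static long blockFileOffset(long virtualOffset) {
    return virtualOffset >>> 16;
}

// Lower 16 bits: offset inside the uncompressed block (at most 0xffff).
static int withinBlockOffset(long virtualOffset) {
    return (int) (virtualOffset & 0xffff);
}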

19 View Complete Implementation : TemporaryInputFormat.java
Copyright Apache License 2.0
Author : asakusafw
private static List<FileSplit> createSplits(Path path, BlockMap blockMap, long start, long end, long splitSize) {
    if (start >= end) {
        return Collections.emptyList();
    }
    if (splitSize <= 0) {
        FileSplit split = getSplit(blockMap, path, start, end);
        return Collections.singletonList(split);
    }
    long threshold = (long) (splitSize * 1.2);
    List<FileSplit> results = new ArrayList<>();
    long current = start;
    while (current < end) {
        long next;
        if (end - current < threshold) {
            next = end;
        } else {
            next = current + splitSize;
        }
        FileSplit split = getSplit(blockMap, path, current, next);
        results.add(split);
        current = next;
    }
    return results;
}
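
The loop above deliberately avoids emitting a tiny trailing split: when the remaining range is shorter than 1.2 times the target split size, the remainder is folded into the current split. Below is a standalone sketch of the same policy that only computes (start, end) boundaries; it is illustrative and not part of TemporaryInputFormat.

import java.util.ArrayList;
import java.util.List;

public final class SplitBoundaries {

    private SplitBoundaries() {
    }

    // Returns [start, end) boundary pairs following the 1.2x threshold policy shown above.
    public static List<long[]> compute(long start, long end, long splitSize) {
        List<long[]> boundaries = new ArrayList<>();
        if (start >= end) {
            return boundaries;
        }
        if (splitSize <= 0) {
            // splitting disabled: the whole range becomes a single split
            boundaries.add(new long[] { start, end });
            return boundaries;
        }
        long threshold = (long) (splitSize * 1.2);
        long current = start;
        while (current < end) {
            // fold a short remainder into the current split instead of emitting a tiny one
            long next = (end - current < threshold) ? end : current + splitSize;
            boundaries.add(new long[] { current, next });
            current = next;
        }
        return boundaries;
    }
}

For example, compute(0, 110, 100) yields the single range [0, 110) because the 10-byte remainder falls under the 120-byte threshold, while compute(0, 130, 100) yields [0, 100) and [100, 130).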

19 View Complete Implementation : TeraScheduler.java
Copyright Apache License 2.0
Author : apache
/**
 * Solve the schedule and modify the FileSplit array to reflect the new
 * schedule. It moves placed splits to the front and unplaceable splits
 * to the end.
 * @return a new list of FileSplits that are modified to have the
 *    best host as the only host.
 * @throws IOException
 */
public List<InputSplit> getNewFileSplits() throws IOException {
    solve();
    FileSplit[] result = new FileSplit[realSplits.length];
    int left = 0;
    int right = realSplits.length - 1;
    for (int i = 0; i < splits.length; ++i) {
        if (splits[i].isAssigned) {
            // copy the split and fix up the locations
            String[] newLocations = { splits[i].locations.get(0).hostname };
            realSplits[i] = new FileSplit(realSplits[i].getPath(), realSplits[i].getStart(), realSplits[i].getLength(), newLocations);
            result[left++] = realSplits[i];
        } else {
            result[right--] = realSplits[i];
        }
    }
    List<InputSplit> ret = new ArrayList<InputSplit>();
    for (FileSplit fs : result) {
        ret.add(fs);
    }
    return ret;
}

19 View Complete Implementation : BAMInputFormat.java
Copyright MIT License
Author : HadoopGenomics
// Handles all the splits that share the Path of the one at index i,
// returning the next index to be used.
private int addIndexedSplits(List<InputSplit> splits, int i, List<InputSplit> newSplits, Configuration cfg) throws IOException {
    final Path file = ((FileSplit) splits.get(i)).getPath();
    List<InputSplit> potentialSplits = new ArrayList<InputSplit>();
    final SplittingBAMIndex idx = new SplittingBAMIndex(file.getFileSystem(cfg).open(getIdxPath(file)));
    int splitsEnd = splits.size();
    for (int j = i; j < splitsEnd; ++j)
        if (!file.equals(((FileSplit) splits.get(j)).getPath()))
            splitsEnd = j;
    if (idx.size() == 1) {
        // no alignments, only the file size, so no splits to add
        return splitsEnd;
    }
    for (int j = i; j < splitsEnd; ++j) {
        final FileSplit fileSplit = (FileSplit) splits.get(j);
        final long start = fileSplit.getStart();
        final long end = start + fileSplit.getLength();
        final Long blockStart = idx.nextAlignment(start);
        // The last split needs to end where the last alignment ends, but the
        // index doesn't store that data (whoops); we only know where the last
        // alignment begins. Fortunately there's no need to change the index
        // format for this: we can just set the end to the maximal length of
        // the final BGZF block (0xffff), and then read until BAMRecordCodec
        // hits EOF.
        Long blockEnd;
        if (j == splitsEnd - 1) {
            blockEnd = idx.prevAlignment(end) | 0xffff;
        } else {
            blockEnd = idx.nextAlignment(end);
        }
        if (blockStart == null || blockEnd == null) {
            logger.warn("Index for {} was not good. Generating probabilistic splits.", file);
            return addProbabilisticSplits(splits, i, newSplits, cfg);
        }
        potentialSplits.add(new FileVirtualSplit(file, blockStart, blockEnd, fileSplit.getLocations()));
    }
    for (InputSplit s : potentialSplits) {
        newSplits.add(s);
    }
    return splitsEnd;
}

19 View Complete Implementation : FlagMakerMetricsMapper.java
Copyright Apache License 2.0
Author : NationalSecurityAgency
@Override
protected void setup(Context context) throws IOException, InterruptedException {
    InputSplit split = context.getInputSplit();
    System.out.println(split.getClass());
    if (split instanceof FileSplit) {
        FileSplit fsplit = (FileSplit) split;
        System.out.println(fsplit.getPath());
    }
    super.setup(context);
}

19 View Complete Implementation : CombineDocumentSplit.java
Copyright Apache License 2.0
Author : marklogic
public void readFields(DataInput in) throws IOException {
    // splits
    int splitSize = in.readInt();
    splits = new ArrayList<FileSplit>();
    for (int i = 0; i < splitSize; i++) {
        Path path = new Path(Text.readString(in));
        long start = in.readLong();
        long len = in.readLong();
        FileSplit split = new FileSplit(path, start, len, null);
        splits.add(split);
    }
    // length
    length = in.readLong();
    // locations
    locations = new HashSet<String>();
}

19 View Complete Implementation : TemporaryInputFormat.java
Copyright Apache License 2.0
Author : asakusafw
private static FileSplit getSplit(BlockMap blockMap, Path path, long start, long end) {
    DirectInputFragment f = blockMap.get(start, end);
    List<String> owners = f.getOwnerNodeNames();
    FileSplit split = new FileSplit(path, start, end - start, owners.toArray(new String[owners.size()]));
    return split;
}

19 View Complete Implementation : TestCRAMInputFormat.java
Copyright MIT License
Author : HadoopGenomics
private void checkSplits(int splitMaxSize) throws IOException {
    // test.cram has containers at positions 1069 and 3403. The file length is 3433.
    // expected splits = 1069+2334, 3403+30
    jobContext.getConfiguration().setInt(FileInputFormat.SPLIT_MAXSIZE, splitMaxSize);
    CRAMInputFormat inputFormat = new CRAMInputFormat();
    List<InputSplit> splits = inputFormat.getSplits(jobContext);
    assertEquals(2, splits.size());
    FileSplit split0 = (FileSplit) splits.get(0);
    FileSplit split1 = (FileSplit) splits.get(1);
    assertEquals(1069, split0.getStart());
    assertEquals(2334, split0.getLength());
    assertEquals(3403, split1.getStart());
    assertEquals(30, split1.getLength());
}

19 View Complete Implementation : CSVReaderBase.java
Copyright Apache License 2.0
Author : NationalSecurityAgency
public void initializeRawFileName(final InputSplit genericSplit) {
    if (genericSplit instanceof FileSplit) {
        final FileSplit fs = (FileSplit) genericSplit;
        rawFileName = fs.getPath().getName();
    }
}

19 View Complete Implementation : TeraScheduler.java
Copyright Apache License 2.0
Author : aliyun-beta
/**
 * Solve the schedule and modify the FileSplit array to reflect the new
 * schedule. It moves placed splits to the front and unplaceable splits
 * to the end.
 * @return a new list of FileSplits that are modified to have the
 *    best host as the only host.
 * @throws IOException
 */
public List<InputSplit> getNewFileSplits() throws IOException {
    solve();
    FileSplit[] result = new FileSplit[realSplits.length];
    int left = 0;
    int right = realSplits.length - 1;
    for (int i = 0; i < splits.length; ++i) {
        if (splits[i].isAssigned) {
            // copy the split and fix up the locations
            String[] newLocations = { splits[i].locations.get(0).hostname };
            realSplits[i] = new FileSplit(realSplits[i].getPath(), realSplits[i].getStart(), realSplits[i].getLength(), newLocations);
            result[left++] = realSplits[i];
        } else {
            result[right--] = realSplits[i];
        }
    }
    List<InputSplit> ret = new ArrayList<InputSplit>();
    for (FileSplit fs : result) {
        ret.add(fs);
    }
    return ret;
}

19 View Complete Implementation : TemporaryInputFormatTest.java
Copyright Apache License 2.0
Author : asakusafw
private FileSplit find(List<FileSplit> splits, long start) {
    for (FileSplit split : splits) {
        if (split.getStart() == start) {
            return split;
        }
    }
    throw new AssertionError(start);
}

19 View Complete Implementation : IngestMetricsMapper.java
Copyright Apache License 2.0
Author : NationalSecurityAgency
@Override
protected void setup(Context context) throws IOException, InterruptedException {
    InputSplit split = context.getInputSplit();
    if (split instanceof FileSplit) {
        FileSplit fsplit = (FileSplit) split;
    }
    super.setup(context);
}

18 View Complete Implementation : TestUniformSizeInputFormat.java
Copyright Apache License 2.0
Author : aliyun-beta
private void checkSplits(Path listFile, List<InputSplit> splits) throws IOException {
    long lastEnd = 0;
    // Verify if each split's start is matching with the previous end and
    // we are not missing anything
    for (InputSplit split : splits) {
        FileSplit fileSplit = (FileSplit) split;
        long start = fileSplit.getStart();
        Assert.assertEquals(lastEnd, start);
        lastEnd = start + fileSplit.getLength();
    }
    // Verify there is nothing more to read from the input file
    SequenceFile.Reader reader = new SequenceFile.Reader(cluster.getFileSystem().getConf(), SequenceFile.Reader.file(listFile));
    try {
        reader.seek(lastEnd);
        CopyListingFileStatus srcFileStatus = new CopyListingFileStatus();
        Text srcRelPath = new Text();
        Assert.assertFalse(reader.next(srcRelPath, srcFileStatus));
    } finally {
        IOUtils.closeStream(reader);
    }
}

18 View Complete Implementation : TestUniformSizeInputFormat.java
Copyright Apache License 2.0
Author : apache
private void checkSplits(Path listFile, List<InputSplit> splits) throws IOException {
    long lastEnd = 0;
    // Verify if each split's start is matching with the previous end and
    // we are not missing anything
    for (InputSplit split : splits) {
        FileSplit fileSplit = (FileSplit) split;
        long start = fileSplit.getStart();
        Assert.assertEquals(lastEnd, start);
        lastEnd = start + fileSplit.getLength();
    }
    // Verify there is nothing more to read from the input file
    SequenceFile.Reader reader = new SequenceFile.Reader(cluster.getFileSystem().getConf(), SequenceFile.Reader.file(listFile));
    try {
        reader.seek(lastEnd);
        CopyListingFileStatus srcFileStatus = new CopyListingFileStatus();
        Text srcRelPath = new Text();
        Assert.assertFalse(reader.next(srcRelPath, srcFileStatus));
    } finally {
        IOUtils.closeStream(reader);
    }
}

18 View Complete Implementation : IndexScanMapper.java
Copyright Apache License 2.0
Author : twitter-archive
@Override
protected void setup(Context context) throws IOException, InterruptedException {
    /**
     * compute the base input file name based on the index data's filepath
     */
    String indexdir = BlockIndexedFileInputFormat.getIndexDir(context);
    FileSplit split = (FileSplit) context.getInputSplit();
    String indexdatapath = split.getPath().toUri().getPath();
    int start = (indexdir.charAt(indexdir.length() - 1) == '/') ? indexdir.length() - 1 : indexdir.length();
    if (indexdatapath.charAt(indexdatapath.length() - 1) == '/')
        indexdatapath = indexdatapath.substring(0, indexdatapath.length() - 1);
    String s1 = indexdatapath.substring(0, indexdatapath.lastIndexOf('/'));
    String s2 = s1.substring(0, s1.lastIndexOf('/'));
    int end = s2.lastIndexOf('/');
    baseFileName = indexdatapath.substring(start, end);
    columnName = context.getConfiguration().get(searchColumnName);
    LOG.info("baseFileName:" + baseFileName + " columnName:" + columnName);
}

18 View Complete Implementation : VCFInputFormat.java
Copyright MIT License
Author : HadoopGenomics
/**
 * Defers to {@link BCFSplitGuesser} as appropriate for each individual
 * path. VCF paths do not require special handling, so their splits are left
 * unchanged.
 */
@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
    if (this.conf == null)
        this.conf = job.getConfiguration();
    final List<InputSplit> origSplits = super.getSplits(job);
    // We have to partition the splits by input format and hand the BCF ones
    // over to getBCFSplits().
    final List<FileSplit> bcfOrigSplits = new ArrayList<FileSplit>(origSplits.size());
    final List<InputSplit> newSplits = new ArrayList<InputSplit>(origSplits.size());
    for (final InputSplit iSplit : origSplits) {
        final FileSplit split = (FileSplit) iSplit;
        if (VCFFormat.BCF.equals(getFormat(split.getPath())))
            bcfOrigSplits.add(split);
        else
            newSplits.add(split);
    }
    fixBCFSplits(bcfOrigSplits, newSplits);
    return filterByInterval(newSplits, conf);
}

18 View Complete Implementation : VCFInputFormat.java
Copyright MIT License
Author : HadoopGenomics
// Handles all the splits that share the Path of the one at index i,
// returning the next index to be used.
private int addGuessedSplits(List<FileSplit> splits, int i, List<InputSplit> newSplits) throws IOException {
    final Path path = splits.get(i).getPath();
    final SeekableStream sin = WrapSeekable.openPath(conf, path);
    final BCFSplitGuesser guesser = new BCFSplitGuesser(sin);
    final boolean isBGZF = guesser.isBGZF();
    InputSplit prevSplit = null;
    for (; i < splits.size(); ++i) {
        final FileSplit fspl = splits.get(i);
        if (!fspl.getPath().equals(path))
            break;
        final String[] locs = fspl.getLocations();
        final long beg = fspl.getStart();
        final long end = beg + fspl.getLength();
        final long alignBeg = guesser.guessNextBCFRecordStart(beg, end);
        // As the guesser goes to the next BGZF block before looking for BCF
        // records, the ending BGZF blocks have to always be traversed fully.
        // Hence force the length to be 0xffff, the maximum possible.
        final long alignEnd = isBGZF ? end << 16 | 0xffff : end;
        final long length = alignEnd - alignBeg;
        if (alignBeg == end) {
            // No records detected in this split: merge it to the previous one.
            // This could legitimately happen e.g. if we have a split that is
            // so small that it only contains the middle part of a BGZF block.
            // 
            // Of course, if it's the first split, then this is simply not a
            // valid BCF file.
            // 
            // FIXME: In theory, any number of splits could only contain parts
            // of the BCF header before we start to see splits that contain BCF
            // records. For now, we require that the split size is at least as
            // big as the header and don't handle that case.
            if (prevSplit == null)
                throw new IOException("'" + path + "': no records in first " + "split: bad BCF file or tiny split size?");
            if (isBGZF) {
                ((FileVirtualSplit) prevSplit).setEndVirtualOffset(alignEnd);
                continue;
            }
            prevSplit = new FileSplit(path, alignBeg, length, locs);
            newSplits.remove(newSplits.size() - 1);
        } else {
            prevSplit = isBGZF ? new FileVirtualSplit(path, alignBeg, alignEnd, locs) : new FileSplit(path, alignBeg, length, locs);
        }
        newSplits.add(prevSplit);
    }
    sin.close();
    return i;
}

18 View Complete Implementation : HadoopExecutableManager.java
Copyright Apache License 2.0
Author : sigmoidanalytics
private void writeDebugHeader() {
    processError("===== Task Information Header =====");
    processError("\nCommand: " + command);
    processError("\nStart time: " + new Date(System.currentTimeMillis()));
    if (job.getBoolean(MRConfiguration.TASK_IS_MAP, false)) {
        MapContext context = (MapContext) PigMapReduce.sJobContext;
        PigSplit pigSplit = (PigSplit) context.getInputSplit();
        int numPaths = pigSplit.getNumPaths();
        processError("\nPigSplit contains " + numPaths + " wrappedSplits.");
        StringBuilder sb = new StringBuilder();
        for (int i = 0; i < numPaths; i++) {
            InputSplit wrappedSplit = pigSplit.getWrappedSplit(i);
            if (wrappedSplit instanceof FileSplit) {
                FileSplit mapInputFileSplit = (FileSplit) wrappedSplit;
                sb.append("\nInput-split: file=");
                sb.append(mapInputFileSplit.getPath());
                sb.append(" start-offset=");
                sb.append(Long.toString(mapInputFileSplit.getStart()));
                sb.append(" length=");
                sb.append(Long.toString(mapInputFileSplit.getLength()));
                processError(sb.toString());
                sb.setLength(0);
            }
        }
    }
    processError("\n=====          * * *          =====\n");
}

18 View Complete Implementation : CombineDocumentSplit.java
Copyright Apache License 2.0
Author : marklogic
public void addSplit(FileSplit split) throws IOException, InterruptedException {
    splits.add(split);
    length += split.getLength();
    for (String loc : split.getLocations()) {
        if (!locations.contains(loc)) {
            locations.add(loc);
        }
    }
}

17 View Complete Implementation : AvroRecordReaderTest.java
Copyright Apache License 2.0
Author : GoogleCloudDataproc
@Test
public void testMultipleSplits() throws IOException {
    long fileLength = testAvroFile.length();
    List<FileSplit> splits = new ArrayList<>();
    Path hadoopPath = new Path("file", null, testAvroFile.getAbsolutePath());
    for (int blockStart = 0; blockStart < fileLength; blockStart += AUTO_SYNC_INTERVAL) {
        splits.add(new FileSplit(hadoopPath, blockStart, AUTO_SYNC_INTERVAL, new String[0]));
    }
    List<String> allRecordKeys = new ArrayList<>();
    long totalFileRecords = 0;
    for (FileSplit split : splits) {
        try (AvroRecordReader reader = new AvroRecordReader()) {
            reader.initializeInternal(split, new Configuration());
            List<String> keysInSplit = collectRecordKeys(reader);
            allRecordKeys.addAll(keysInSplit);
            int recordsInSplit = keysInSplit.size();
            totalFileRecords += recordsInSplit;
            // Not all 'blocks' contain records, but none should have all records
            Truth.assertThat(recordsInSplit).isLessThan(RECORD_COUNT);
        }
    }
    Truth.assertThat(allRecordKeys).containsExactlyElementsIn(allAddedKeys);
    Truth.assertThat(totalFileRecords).isEqualTo(RECORD_COUNT);
}

17 View Complete Implementation : CRAMInputFormat.java
Copyright MIT License
Author : HadoopGenomics
public List<InputSplit> getSplits(List<InputSplit> splits, Configuration conf) throws IOException {
    // update splits to align with CRAM container boundaries
    List<InputSplit> newSplits = new ArrayList<InputSplit>();
    Map<Path, List<Long>> fileToOffsets = new HashMap<Path, List<Long>>();
    for (InputSplit split : splits) {
        FileSplit fileSplit = (FileSplit) split;
        Path path = fileSplit.getPath();
        List<Long> containerOffsets = fileToOffsets.get(path);
        if (containerOffsets == null) {
            containerOffsets = getContainerOffsets(conf, path);
            fileToOffsets.put(path, containerOffsets);
        }
        long newStart = nextContainerOffset(containerOffsets, fileSplit.getStart());
        long newEnd = nextContainerOffset(containerOffsets, fileSplit.getStart() + fileSplit.getLength());
        long newLength = newEnd - newStart;
        if (newLength == 0) {
            // split is wholly within a container
            continue;
        }
        FileSplit newSplit = new FileSplit(fileSplit.getPath(), newStart, newLength, fileSplit.getLocations());
        newSplits.add(newSplit);
    }
    return newSplits;
}
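
The helpers getContainerOffsets and nextContainerOffset are not included in the snippet above. One plausible shape for nextContainerOffset, assuming the offset list is sorted in ascending order and terminated by the file length so that every position maps to a boundary at or after it, is sketched below; the actual Hadoop-BAM implementation may differ.

// Illustrative sketch only: returns the first container boundary at or after position.
private static long nextContainerOffset(List<Long> containerOffsets, long position) {
    for (long offset : containerOffsets) {
        if (offset >= position) {
            return offset;
        }
    }
    throw new IllegalStateException("position " + position + " is past the last container offset");
}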

17 View Complete Implementation : VCFInputFormat.java
Copyright MIT License
Author : HadoopGenomics
private List<InputSplit> filterByInterval(List<InputSplit> splits, Configuration conf) throws IOException {
    List<Interval> intervals = getIntervals(conf);
    if (intervals == null) {
        return splits;
    }
    List<Block> blocks = new ArrayList<>();
    Set<Path> vcfFiles = new LinkedHashSet<Path>();
    for (InputSplit split : splits) {
        if (split instanceof FileSplit) {
            vcfFiles.add(((FileSplit) split).getPath());
        } else if (split instanceof FileVirtualSplit) {
            vcfFiles.add(((FileVirtualSplit) split).getPath());
        } else {
            throw new IllegalArgumentException("split '" + split + "' has unknown type: cannot extract path");
        }
    }
    for (Path vcfFile : vcfFiles) {
        Path indexFile = vcfFile.suffix(TabixUtils.STANDARD_INDEX_EXTENSION);
        FileSystem fs = vcfFile.getFileSystem(conf);
        if (!fs.exists(indexFile)) {
            logger.warn("No tabix index file found for {}, splits will not be filtered, which may be very inefficient", indexFile);
            return splits;
        }
        try (InputStream in = new BlockCompressedInputStream(fs.open(indexFile))) {
            TabixIndex index = new TabixIndex(in);
            for (Locatable interval : intervals) {
                String contig = interval.getContig();
                int intervalStart = interval.getStart();
                int intervalEnd = interval.getEnd();
                blocks.addAll(index.getBlocks(contig, intervalStart, intervalEnd));
            }
        }
    }
    // Use the blocks to filter the splits
    List<InputSplit> filteredSplits = new ArrayList<InputSplit>();
    for (InputSplit split : splits) {
        if (split instanceof FileSplit) {
            FileSplit fileSplit = (FileSplit) split;
            long splitStart = fileSplit.getStart() << 16;
            long splitEnd = (fileSplit.getStart() + fileSplit.getLength()) << 16;
            // if any block overlaps with the split, keep the split, but don't adjust its size
            // as the BGZF block decompression is handled by BGZFCodec, not by the reader
            // directly
            for (Block block : blocks) {
                long blockStart = block.getStartPosition();
                long blockEnd = block.getEndPosition();
                if (overlaps(splitStart, splitEnd, blockStart, blockEnd)) {
                    filteredSplits.add(split);
                    break;
                }
            }
        } else {
            FileVirtualSplit virtualSplit = (FileVirtualSplit) split;
            long splitStart = virtualSplit.getStartVirtualOffset();
            long splitEnd = virtualSplit.getEndVirtualOffset();
            // if any block overlaps with the split, keep the split, but adjust the start and
            // end to the maximally overlapping portion for all blocks that overlap
            long newStart = Long.MAX_VALUE;
            long newEnd = Long.MIN_VALUE;
            boolean overlaps = false;
            for (Block block : blocks) {
                long blockStart = block.getStartPosition();
                long blockEnd = block.getEndPosition();
                if (overlaps(splitStart, splitEnd, blockStart, blockEnd)) {
                    long overlapStart = Math.max(splitStart, blockStart);
                    long overlapEnd = Math.min(splitEnd, blockEnd);
                    newStart = Math.min(newStart, overlapStart);
                    newEnd = Math.max(newEnd, overlapEnd);
                    overlaps = true;
                }
            }
            if (overlaps) {
                filteredSplits.add(new FileVirtualSplit(virtualSplit.getPath(), newStart, newEnd, virtualSplit.getLocations()));
            }
        }
    }
    return filteredSplits;
}
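
The overlaps helper called in both branches above is not part of the snippet. A minimal half-open interval check that would satisfy these call sites could look like the sketch below; this is an assumption about its shape, not necessarily the library's actual definition.

// Illustrative sketch: true if [start, end) and [start2, end2) share at least one position.
private static boolean overlaps(long start, long end, long start2, long end2) {
    return start < end2 && start2 < end;
}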

17 View Complete Implementation : TemporaryInputFormatTest.java
Copyright Apache License 2.0
Author : asakusafw
/**
 * computing splits with forced splitting.
 */
@Test
public void splits_force() {
    BlockMap blocks = blocks("testing", TemporaryFile.BLOCK_SIZE * 10);
    List<FileSplit> splits = TemporaryInputFormat.computeSplits(new Path("testing"), blocks, TemporaryFile.BLOCK_SIZE + 1);
    assertThat(splits, hasSize(5));
    FileSplit s0 = find(splits, TemporaryFile.BLOCK_SIZE * 0);
    assertThat(s0.getLength(), is((long) TemporaryFile.BLOCK_SIZE * 2));
    FileSplit s1 = find(splits, TemporaryFile.BLOCK_SIZE * 2);
    assertThat(s1.getLength(), is((long) TemporaryFile.BLOCK_SIZE * 2));
    FileSplit s2 = find(splits, TemporaryFile.BLOCK_SIZE * 4);
    assertThat(s2.getLength(), is((long) TemporaryFile.BLOCK_SIZE * 2));
    FileSplit s3 = find(splits, TemporaryFile.BLOCK_SIZE * 6);
    assertThat(s3.getLength(), is((long) TemporaryFile.BLOCK_SIZE * 2));
    FileSplit s4 = find(splits, TemporaryFile.BLOCK_SIZE * 8);
    assertThat(s4.getLength(), is((long) TemporaryFile.BLOCK_SIZE * 2));
}

17 View Complete Implementation : TestTmpFileCompression.java
Copyright Apache License 2.0
Author : sigmoidanalytics
@Test
public void testTFileRecordWriterReaderAndProgress() throws Exception {
    // Create a small tFile by pig tfilewriter, read by tfilereader and
    // make sure that data matches,
    // progress is above zero and increasing
    File tFile = File.createTempFile("test", "tfile");
    Path basicTFile = new Path(tFile.getAbsolutePath());
    // delete the empty file and let TFileRecordWriter create it again.
    tFile.delete();
    Configuration conf = new Configuration();
    conf.set("tfile.io.chunk.size", "100");
    conf.set("fs.default.name", "file:///");
    for (String codec : new String[] { "none", "gz" }) {
        System.err.println("Testing RecordWriter/Reader with codec: " + codec);
        try {
            TFileRecordWriter writer = new TFileRecordWriter(basicTFile, codec, conf);
            Tuple tuple = TupleFactory.getInstance().newTuple(1);
            int LOOP_SIZE = 25000;
            for (int i = 0; i <= LOOP_SIZE; i++) {
                String key = String.format("%010d", i);
                tuple.set(0, key);
                writer.write(null, tuple);
            }
            writer.close(null);
            int size = (int) tFile.length();
            FileSplit split = new FileSplit(basicTFile, 0, size, null);
            TFileRecordReader reader = new TFileRecordReader();
            reader.initialize(split, HadoopShims.createTaskAttemptContext(conf, HadoopShims.createTaskAttemptID("jt", 1, true, 1, 1)));
            float progress = 0, lastprogress = 0;
            int curval = 0, prevval = -1;
            while (reader.nextKeyValue()) {
                Tuple t = (Tuple) reader.getCurrentValue();
                curval = Integer.valueOf((String) t.get(0));
                replacedertEquals("Unexpected Value", curval, prevval + 1);
                prevval = curval;
                progress = reader.getProgress();
                if (progress != lastprogress) {
                    System.err.println("progress: " + progress);
                }
                replacedertTrue("Progress is not positive", progress > 0);
                replacedertTrue("Progress is not increasing", progress >= lastprogress);
                lastprogress = progress;
            }
            replacedertEquals("Last value does not match", curval, LOOP_SIZE);
            reader.close();
        } finally {
            tFile.delete();
        }
    }
}

17 View Complete Implementation : TemporaryInputFormatTest.java
Copyright Apache License 2.0
Author : asakusafw
/**
 * simple case for computing splits.
 */
@Test
public void splits_simple() {
    BlockMap blocks = blocks("testing", m(10));
    List<FileSplit> splits = TemporaryInputFormat.computeSplits(new Path("testing"), blocks, m(64));
    assertThat(splits, hasSize(1));
    FileSplit s0 = find(splits, 0);
    assertThat(s0.getLength(), is(m(10)));
}

17 View Complete Implementation : TemporaryInputFormatTest.java
Copyright Apache License 2.0
Author : asakusafw
/**
 * computing splits w/ suppress.
 */
@Test
public void splits_suppress() {
    BlockMap blocks = blocks("testing", TemporaryFile.BLOCK_SIZE * 10);
    List<FileSplit> splits = TemporaryInputFormat.computeSplits(new Path("testing"), blocks, 0);
    assertThat(splits, hasSize(1));
    FileSplit s0 = find(splits, 0);
    assertThat(s0.getLength(), is((long) TemporaryFile.BLOCK_SIZE * 10));
}

17 View Complete Implementation : AnySAMInputFormat.java
Copyright MIT License
Author : HadoopGenomics
/**
 * Defers to {@link BAMInputFormat} or {@link CRAMInputFormat} as appropriate for each
 * individual path. SAM paths do not require special handling, so their splits are left
 * unchanged.
 */
@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
    if (this.conf == null)
        this.conf = job.getConfiguration();
    final List<InputSplit> origSplits = BAMInputFormat.removeIndexFiles(super.getSplits(job));
    // We have to partition the splits by input format and hand them over to
    // the *InputFormats for any further handling.
    // 
    // BAMInputFormat and CRAMInputFormat need to change the split boundaries, so we can
    // just extract the BAM and CRAM ones and leave the rest as they are.
    final List<InputSplit> bamOrigSplits = new ArrayList<InputSplit>(origSplits.size()), cramOrigSplits = new ArrayList<InputSplit>(origSplits.size()), newSplits = new ArrayList<InputSplit>(origSplits.size());
    for (final InputSplit iSplit : origSplits) {
        final FileSplit split = (FileSplit) iSplit;
        if (SAMFormat.BAM.equals(getFormat(split.getPath())))
            bamOrigSplits.add(split);
        else if (SAMFormat.CRAM.equals(getFormat(split.getPath())))
            cramOrigSplits.add(split);
        else
            newSplits.add(split);
    }
    newSplits.addAll(bamIF.getSplits(bamOrigSplits, job.getConfiguration()));
    newSplits.addAll(cramIF.getSplits(cramOrigSplits, job.getConfiguration()));
    return newSplits;
}

17 View Complete Implementation : MneMapreducePersonDataTest.java
Copyright Apache License 2.0
Author : apache
@Test(enabled = true, dependsOnMethods = { "testWritePersonData" })
public void testReadPersonData() throws Exception {
    long sumage = 0L;
    long reccnt = 0L;
    File folder = new File(m_workdir.toString());
    File[] listfiles = folder.listFiles();
    for (int idx = 0; idx < listfiles.length; ++idx) {
        if (listfiles[idx].isFile() && listfiles[idx].getName().startsWith(MneConfigHelper.getBaseOutputName(m_conf, null)) && listfiles[idx].getName().endsWith(MneConfigHelper.DEFAULT_FILE_EXTENSION)) {
            System.out.println(String.format("Verifying : %s", listfiles[idx].getName()));
            FileSplit split = new FileSplit(new Path(m_workdir, listfiles[idx].getName()), 0, 0L, new String[0]);
            InputFormat<NullWritable, MneDurableInputValue<Person<Long>>> inputFormat = new MneInputFormat<MneDurableInputValue<Person<Long>>, Person<Long>>();
            RecordReader<NullWritable, MneDurableInputValue<Person<Long>>> reader = inputFormat.createRecordReader(split, m_tacontext);
            MneDurableInputValue<Person<Long>> personval = null;
            while (reader.nextKeyValue()) {
                personval = reader.getCurrentValue();
                AssertJUnit.assertTrue(personval.getValue().getAge() < 51);
                sumage += personval.getValue().getAge();
                ++reccnt;
            }
            reader.close();
        }
    }
    AssertJUnit.assertEquals(m_reccnt, reccnt);
    AssertJUnit.assertEquals(m_sumage, sumage);
    System.out.println(String.format("The checksum of ages is %d", sumage));
}

17 View Complete Implementation : JsonRecordReader.java
Copyright Apache License 2.0
Author : NationalSecurityAgency
@Override
public void initialize(InputSplit split, TaskAttemptContext context) throws IOException {
    super.initialize(split, context);
    if (!(split instanceof FileSplit)) {
        throw new IOException("Cannot handle split type " + split.getClreplaced().getName());
    }
    FileSplit fsplit = (FileSplit) split;
    Path file = fsplit.getPath();
    rawFileName = file.getName();
    fileURI = file.toUri();
    FileSystem fs = file.getFileSystem(context.getConfiguration());
    InputStream is = fs.open(file);
    start = fsplit.getStart();
    end = start + fsplit.getLength();
    pos = start;
    String normURI = fileURI.getScheme() + "://" + fileURI.getPath();
    setupReader(is);
    if (logger.isInfoEnabled()) {
        logger.info("Reading Json records from " + normURI + " via " + is.getClreplaced().getName());
    }
    jsonHelper = (JsonDataTypeHelper) createHelper(context.getConfiguration());
    this.parseHeaderOnly = !jsonHelper.processExtraFields();
    jsonFlattener = jsonHelper.newFlattener();
    if (logger.isInfoEnabled()) {
        logger.info("Json flattener mode: " + jsonFlattener.getFlattenMode().name());
    }
}

17 View Complete Implementation : StreamingDocumentReader.java
Copyright Apache License 2.0
Author : marklogic
@Override
public boolean nextKeyValue() throws IOException, InterruptedException {
    if (iterator.hasNext()) {
        FileSplit split = iterator.next();
        setFile(split.getPath());
        String uri = makeURIFromPath(file);
        if (setKey(uri, 0, 0, true)) {
            return true;
        }
        value = new StreamLocator(file, CompressionCodec.NONE);
        bytesRead += split.getLength();
        return true;
    }
    return false;
}

17 View Complete Implementation : CorrelationTechniquesMapper.java
Copyright BSD 3-Clause "New" or "Revised" License
Author : VIDA-NYU
@Override
public void setup(Context context) throws IOException, InterruptedException {
    Configuration conf = context.getConfiguration();
    String[] datasetNames = conf.get("dataset-names", "").split(",");
    String[] datasetIds = conf.get("dataset-keys", "").split(",");
    for (int i = 0; i < datasetNames.length; i++) datasetToId.put(datasetNames[i], datasetIds[i]);
    FileSplit fileSplit = (FileSplit) context.getInputSplit();
    String[] fileSplitTokens = fileSplit.getPath().getParent().toString().split("/");
    String dataset = fileSplitTokens[fileSplitTokens.length - 1];
    datasetIdStr = datasetToId.get(dataset);
    String[] aggregates = context.getConfiguration().get("dataset-" + datasetIdStr + "-agg", "").split(",");
    index = new int[aggregates.length];
    for (int i = 0; i < aggregates.length; i++) index[i] = Integer.parseInt(aggregates[i].split("-")[0]);
    for (int i = 0; i < datasetIds.length; i++) {
        datasetAggSize.put(Integer.parseInt(datasetIds[i]), Integer.parseInt(conf.get("dataset-" + datasetIds[i] + "-agg-size", "0")));
    }
    if (conf.get("no-relationship", "").length() > 0) {
        String[] noRelationshipStr = conf.get("no-relationship").split(",");
        for (String relationship : noRelationshipStr) {
            String[] ids = relationship.split("-");
            if (Integer.parseInt(ids[0]) < Integer.parseInt(ids[1])) {
                noRelationship.add(relationship);
            } else {
                noRelationship.add(ids[1] + "-" + ids[0]);
            }
        }
    }
    String[] firstGroupStr = conf.get("first-group", "").split(",");
    String[] secondGroupStr = conf.get("second-group", "").split(",");
    for (String dt : firstGroupStr) {
        firstGroup.add(Integer.parseInt(dt));
    }
    for (String dt : secondGroupStr) {
        secondGroup.add(Integer.parseInt(dt));
    }
}

17 View Complete Implementation : AvroCompactionMapper.java
Copyright Apache License 2.0
Author : ExpediaInceCommercePlatform
@Override
protected void map(final AvroKey<GenericRecord> key, final Object value, final Context context) throws IOException, InterruptedException {
    final FileSplit fileSplit = (FileSplit) context.getInputSplit();
    final Path filePath = fileSplit.getPath();
    AvroValue<GenericRecord> record = new AvroValue<GenericRecord>(key.datum());
    if (isValidData(key, filePath)) {
        final Text mapperKey = baseMapper.getKey(filePath.toString());
        context.write(mapperKey, record);
    }
}

17 View Complete Implementation : AvroRecordReaderTest.java
Copyright Apache License 2.0
Author : GoogleCloudDataproc
@Test
public void testSingleSplit() throws IOException {
    FileSplit fileSplit = new FileSplit(new Path("file", null, testAvroFile.getAbsolutePath()), 0, testAvroFile.length(), new String[0]);
    AvroRecordReader recordReader = new AvroRecordReader();
    recordReader.initializeInternal(fileSplit, new Configuration());
    Truth.assertThat(remainingRecordCount(recordReader)).isEqualTo(RECORD_COUNT);
    recordReader.close();
}

17 View Complete Implementation : SSTableRecordReader.java
Copyright Apache License 2.0
Author : Knewton
/**
 * Performs all the necessary actions to initialize and prepare this record reader.
 */
@Override
public void initialize(InputSplit inputSplit, TaskAttemptContext context) throws IOException, InterruptedException {
    this.ctx = context;
    conf = context.getConfiguration();
    keysRead = 0;
    components = Sets.newHashSetWithExpectedSize(3);
    FileSplit split = (FileSplit) inputSplit;
    validateConfiguration(conf);
    // Get comparator. Subcomparator can be null.
    AbstractType<?> comparator = getConfComparator(conf);
    AbstractType<?> subcomparator = getConfSubComparator(conf);
    // Get partitioner for keys
    IPartitioner partitioner = getConfPartitioner(conf);
    // Move minimum required db tables to local disk.
    Path dataTablePath = split.getPath();
    FileSystem remoteFS = FileSystem.get(dataTablePath.toUri(), conf);
    FileSystem localFS = FileSystem.getLocal(conf);
    copyTablesToLocal(remoteFS, localFS, dataTablePath, context);
    CFMetaData cfMetaData;
    if (getConfIsSparse(conf)) {
        cfMetaData = CFMetaData.sparseCFMetaData(getDescriptor().ksname, getDescriptor().cfname, comparator);
    } else {
        cfMetaData = CFMetaData.denseCFMetaData(getDescriptor().ksname, getDescriptor().cfname, comparator, subcomparator);
    }
    // Open table and get scanner
    SSTableReader tableReader = openSSTableReader(partitioner, cfMetaData);
    setTableScanner(tableReader);
}

17 View Complete Implementation : TestHiveColumnarStorage.java
Copyright Apache License 2.0
Author : sigmoidanalytics
private ColumnarStruct readRow(File outputFile, Path outputPath, String schema) throws IOException, InterruptedException, SerDeException {
    FileSplit fileSplit = new FileSplit(outputPath, 0L, outputFile.length(), (String[]) null);
    Path splitPath = fileSplit.getPath();
    RCFileRecordReader<LongWritable, BytesRefArrayWritable> rcFileRecordReader = new RCFileRecordReader<LongWritable, BytesRefArrayWritable>(new Configuration(false), new org.apache.hadoop.mapred.FileSplit(splitPath, fileSplit.getStart(), fileSplit.getLength(), new org.apache.hadoop.mapred.JobConf(conf)));
    LongWritable key = rcFileRecordReader.createKey();
    BytesRefArrayWritable value = rcFileRecordReader.createValue();
    rcFileRecordReader.next(key, value);
    rcFileRecordReader.close();
    ColumnarStruct struct = readColumnarStruct(value, schema);
    return struct;
}

17 View Complete Implementation : SeqCompactionMapper.java
Copyright Apache License 2.0
Author : ExpediaInceCommercePlatform
/**
 * {@inheritDoc}
 */
protected void map(final Object key, final Text value, final Context context) throws IOException, InterruptedException {
    if (value != null && value.toString() != null && value.toString().isEmpty()) {
        return;
    }
    FileSplit fileSplit = (FileSplit) context.getInputSplit();
    final Text mapperKey = baseMapper.getKey(fileSplit.getPath().toString());
    context.write(mapperKey, value);
}

17 View Complete Implementation : BGZFSplitFileInputFormat.java
Copyright MIT License
Author : HadoopGenomics
// Works the same way as addIndexedSplits, to avoid having to reopen the
// file repeatedly and checking addIndexedSplits for an index repeatedly.
private int addProbabilisticSplits(List<InputSplit> splits, int i, List<InputSplit> newSplits, Configuration cfg) throws IOException {
    final Path path = ((FileSplit) splits.get(i)).getPath();
    final FSDataInputStream in = path.getFileSystem(cfg).open(path);
    final BGZFSplitGuesser guesser = new BGZFSplitGuesser(in);
    FileSplit fspl;
    do {
        fspl = (FileSplit) splits.get(i);
        final long beg = fspl.getStart();
        final long end = beg + fspl.getLength();
        final long alignedBeg = guesser.guessNextBGZFBlockStart(beg, end);
        newSplits.add(new FileSplit(path, alignedBeg, end - alignedBeg, fspl.getLocations()));
        ++i;
    } while (i < splits.size() && fspl.getPath().equals(path));
    in.close();
    return i;
}

17 View Complete Implementation : BGZFSplitFileInputFormat.java
Copyright MIT License
Author : HadoopGenomics
// Handles all the splits that share the Path of the one at index i,
// returning the next index to be used.
private int addIndexedSplits(List<InputSplit> splits, int i, List<InputSplit> newSplits, Configuration cfg) throws IOException {
    final Path file = ((FileSplit) splits.get(i)).getPath();
    final BGZFBlockIndex idx = new BGZFBlockIndex(file.getFileSystem(cfg).open(getIdxPath(file)));
    int splitsEnd = splits.size();
    for (int j = i; j < splitsEnd; ++j)
        if (!file.equals(((FileSplit) splits.get(j)).getPath()))
            splitsEnd = j;
    for (int j = i; j < splitsEnd; ++j) {
        final FileSplit fileSplit = (FileSplit) splits.get(j);
        final long start = fileSplit.getStart();
        final long end = start + fileSplit.getLength();
        final Long blockStart = idx.prevBlock(start);
        final Long blockEnd = j == splitsEnd - 1 ? idx.prevBlock(end) : idx.nextBlock(end);
        if (blockStart == null)
            throw new RuntimeException("Internal error or invalid index: no block start for " + start);
        if (blockEnd == null)
            throw new RuntimeException("Internal error or invalid index: no block end for " + end);
        newSplits.add(new FileSplit(file, blockStart, blockEnd - blockStart, fileSplit.getLocations()));
    }
    return splitsEnd;
}

17 View Complete Implementation : FileInputLoadFunc.java
Copyright Apache License 2.0
Author : sigmoidanalytics
@Override
public WritableComparable<?> getSplitComparable(InputSplit split) throws IOException {
    FileSplit fileSplit = null;
    if (split instanceof FileSplit) {
        fileSplit = (FileSplit) split;
    } else {
        throw new RuntimeException("LoadFunc expected split of type FileSplit");
    }
    return new FileSplitComparable(fileSplit.getPath().toString(), fileSplit.getStart());
}

17 View Complete Implementation : UnshardedInputSplitTest.java
Copyright Apache License 2.0
Author : GoogleCloudDataproc
/**
 * Tests the toString method.
 */
@Test
public void testToString() {
    // Create a new InputSplit containing the values.
    FileSplit inputSplit = new FileSplit(PATH, START, START + LENGTH, new String[0]);
    UnshardedInputSplit bqInputSplit = new UnshardedInputSplit(PATH, START, START + LENGTH, new String[0]);
    // Test for correct construction
    assertThat(bqInputSplit.toString()).isEqualTo(inputSplit.toString());
}

16 View Complete Implementation : TemporaryInputFormatTest.java
Copyright Apache License 2.0
Author : asakusafw
/**
 * computing splits with already aligned blocks.
 */
@Test
public void splits_aligned() {
    BlockMap blocks = blocks("testing", TemporaryFile.BLOCK_SIZE, TemporaryFile.BLOCK_SIZE);
    List<FileSplit> splits = TemporaryInputFormat.computeSplits(new Path("testing"), blocks, m(64));
    assertThat(splits, hasSize(2));
    FileSplit s0 = find(splits, 0);
    assertThat(s0.getLength(), is((long) TemporaryFile.BLOCK_SIZE));
    FileSplit s1 = find(splits, TemporaryFile.BLOCK_SIZE);
    assertThat(s1.getLength(), is((long) TemporaryFile.BLOCK_SIZE));
}

16 View Complete Implementation : RawSequenceFileRecordReader.java
Copyright Apache License 2.0
Author : twitter
@Override
public void initialize(InputSplit inputSplit, TaskAttemptContext context) throws IOException, InterruptedException {
    Preconditions.checkNotNull(inputSplit, "InputSplit is null");
    Preconditions.checkNotNull(context, "TaskAttemptContext is null");
    Configuration conf = HadoopCompat.getConfiguration(context);
    FileSplit fileSplit = (FileSplit) inputSplit;
    Path path = fileSplit.getPath();
    // inhibit class loading during SequenceFile.Reader initialization
    reader = new SequenceFile.Reader(path.getFileSystem(conf), path, conf) {

        @Override
        public synchronized Class<?> getKeyClass() {
            return BytesWritable.class;
        }

        @Override
        public synchronized Class<?> getValueClass() {
            return BytesWritable.class;
        }
    };
    vbytes = this.reader.createValueBytes();
    start = fileSplit.getStart();
    if (start > reader.getPosition()) {
        reader.sync(start);
    }
    start = reader.getPosition();
    end = fileSplit.getStart() + fileSplit.getLength();
    more = start < end;
}

16 View Complete Implementation : TestSequenceFileStorage.java
Copyright Apache License 2.0
Author : twitter
@Test
public void readOutsidePig() throws ClassCastException, ParseException, ClassNotFoundException, InstantiationException, IllegalAccessException, IOException, InterruptedException {
    // simulate Pig front-end runtime
    final SequenceFileLoader<IntWritable, Text> storage = new SequenceFileLoader<IntWritable, Text>("-c " + IntWritableConverter.class.getName(), "-c " + TextConverter.class.getName());
    Job job = new Job();
    storage.setUDFContextSignature("12345");
    storage.setLocation(tempFilename, job);
    // simulate Pig back-end runtime
    RecordReader<DataInputBuffer, DataInputBuffer> reader = new RawSequenceFileRecordReader();
    FileSplit fileSplit = new FileSplit(new Path(tempFilename), 0, new File(tempFilename).length(), new String[] { "localhost" });
    TaskAttemptContext context = HadoopCompat.newTaskAttemptContext(HadoopCompat.getConfiguration(job), new TaskAttemptID());
    reader.initialize(fileSplit, context);
    InputSplit[] wrappedSplits = new InputSplit[] { fileSplit };
    int inputIndex = 0;
    List<OperatorKey> targetOps = Arrays.asList(new OperatorKey("54321", 0));
    int splitIndex = 0;
    PigSplit split = new PigSplit(wrappedSplits, inputIndex, targetOps, splitIndex);
    split.setConf(HadoopCompat.getConfiguration(job));
    storage.prepareToRead(reader, split);
    // read tuples and validate
    validate(new LoadFuncTupleIterator(storage));
}

16 View Complete Implementation : AbstractTestWritableConverter.java
Copyright Apache License 2.0
Author : twitter
@Test
public void readOutsidePig() throws ClassCastException, ParseException, ClassNotFoundException, InstantiationException, IllegalAccessException, IOException, InterruptedException {
    // simulate Pig front-end runtime
    final SequenceFileLoader<IntWritable, Text> loader = new SequenceFileLoader<IntWritable, Text>(String.format("-c %s", IntWritableConverter.class.getName()), String.format("-c %s %s", writableConverterClass.getName(), writableConverterArguments));
    Job job = new Job();
    loader.setUDFContextSignature("12345");
    loader.setLocation(tempFilename, job);
    // simulate Pig back-end runtime
    final RecordReader<DataInputBuffer, DataInputBuffer> reader = new RawSequenceFileRecordReader();
    final FileSplit fileSplit = new FileSplit(new Path(tempFilename), 0, new File(tempFilename).length(), new String[] { "localhost" });
    final TaskAttemptContext context = HadoopCompat.newTaskAttemptContext(HadoopCompat.getConfiguration(job), new TaskAttemptID());
    reader.initialize(fileSplit, context);
    final InputSplit[] wrappedSplits = new InputSplit[] { fileSplit };
    final int inputIndex = 0;
    final List<OperatorKey> targetOps = Arrays.asList(new OperatorKey("54321", 0));
    final int splitIndex = 0;
    final PigSplit split = new PigSplit(wrappedSplits, inputIndex, targetOps, splitIndex);
    split.setConf(HadoopCompat.getConfiguration(job));
    loader.prepareToRead(reader, split);
    // read tuples and validate
    validate(new LoadFuncTupleIterator(loader));
}

16 View Complete Implementation : BCFRecordReader.java
Copyright MIT License
Author : HadoopGenomics
@Override
public void initialize(InputSplit spl, TaskAttemptContext ctx) throws IOException {
    isBGZF = spl instanceof FileVirtualSplit;
    if (isBGZF) {
        final FileVirtualSplit split = (FileVirtualSplit) spl;
        final Path file = split.getPath();
        final FileSystem fs = file.getFileSystem(ctx.getConfiguration());
        final FSDataInputStream inFile = fs.open(file);
        bci = new BlockCompressedInputStream(inFile);
        in = new PositionalBufferedStream(bci);
        initContigDict();
        inFile.seek(0);
        bci = new BlockCompressedInputStream(new WrapSeekable<FSDataInputStream>(inFile, fs.getFileStatus(file).getLen(), file));
        final long virtualStart = split.getStartVirtualOffset(), virtualEnd = split.getEndVirtualOffset();
        this.fileStart = virtualStart >>> 16;
        this.length = (virtualEnd >>> 16) - fileStart;
        bci.seek(virtualStart);
        // Since PositionalBufferedStream does its own buffering, we have to
        // prevent it from going too far by using a BGZFLimitingStream. It
        // also allows nextKeyValue() to simply check for EOF instead of
        // looking at virtualEnd.
        in = new PositionalBufferedStream(new BGZFLimitingStream(bci, virtualEnd));
    } else {
        final FileSplit split = (FileSplit) spl;
        this.fileStart = split.getStart();
        this.length = split.getLength();
        final Path file = split.getPath();
        in = new PositionalBufferedStream(file.getFileSystem(ctx.getConfiguration()).open(file));
        initContigDict();
        IOUtils.skipFully(in, fileStart - in.getPosition());
    }
}

16 View Complete Implementation : TemporaryInputFormatTest.java
Copyright Apache License 2.0
Author : asakusafw
/**
 * computing splits with unaligned blocks.
 */
@Test
public void splits_unaligned() {
    BlockMap blocks = blocks("testing", TemporaryFile.BLOCK_SIZE - 10, TemporaryFile.BLOCK_SIZE);
    List<FileSplit> splits = TemporaryInputFormat.computeSplits(new Path("testing"), blocks, m(128));
    assertThat(splits, hasSize(2));
    FileSplit s0 = find(splits, 0);
    assertThat(s0.getLength(), is((long) TemporaryFile.BLOCK_SIZE));
    FileSplit s1 = find(splits, TemporaryFile.BLOCK_SIZE);
    assertThat(s1.getLength(), is((long) TemporaryFile.BLOCK_SIZE - 10));
}

16 View Complete Implementation : GsonBigQueryInputFormatTest.java
Copyright Apache License 2.0
Author : GoogleCloudDataproc
/**
 * Tests getSplits method of GsonBigQueryInputFormat with federated data.
 */
@Test
public void testGetSplitsFederated() throws Exception {
    JobContext jobContext = new JobContextImpl(config, new JobID());
    table.setType("EXTERNAL").setExternalDataConfiguration(new ExternalDataConfiguration().setSourceFormat("NEWLINE_DELIMITED_JSON").setSourceUris(ImmutableList.of("gs://foo-bucket/bar.json")));
    FileSplit split = new FileSplit(new Path("gs://foo-bucket/bar.json"), 0, 100, new String[0]);
    when(mockInputFormat.getSplits(eq(jobContext))).thenReturn(ImmutableList.<InputSplit>of(split));
    GsonBigQueryInputFormat gsonBigQueryInputFormat = new GsonBigQueryInputFormatForTest();
    gsonBigQueryInputFormat.setDelegateInputFormat(mockInputFormat);
    // Run getSplits method.
    List<InputSplit> splits = gsonBigQueryInputFormat.getSplits(jobContext);
    assertThat(splits).hasSize(1);
    assertThat(((FileSplit) splits.get(0)).getPath()).isEqualTo(split.getPath());
    assertThat(config.get("mapreduce.input.fileinputformat.inputdir")).isEqualTo("gs://foo-bucket/bar.json");
    verify(mockBigQueryHelper, times(1)).getTable(eq(tableRef));
    verifyNoMoreInteractions(mockBigquery);
}

16 View Complete Implementation : BAMInputFormat.java
Copyright MIT License
Author : HadoopGenomics
// Handles all the splits that share the Path of the one at index i,
// returning the next index to be used.
private int addBAISplits(List<InputSplit> splits, int i, List<InputSplit> newSplits, Configuration conf) throws IOException {
    int splitsEnd = i;
    final Path path = ((FileSplit) splits.get(i)).getPath();
    final Path baiPath = getBAIPath(path);
    final FileSystem fs = path.getFileSystem(conf);
    final Path sinPath;
    if (fs.exists(baiPath)) {
        sinPath = baiPath;
    } else {
        sinPath = new Path(path.toString().replaceFirst("\\.bam$", BAMIndex.BAMIndexSuffix));
    }
    try (final FSDataInputStream in = fs.open(path);
        final SeekableStream guesserSin = WrapSeekable.openPath(fs, path);
        final SeekableStream sin = WrapSeekable.openPath(fs, sinPath)) {
        SAMFileHeader header = SAMHeaderReader.readSAMHeaderFrom(in, conf);
        SAMSequenceDictionary dict = header.getSequenceDictionary();
        final BAMSplitGuesser guesser = new BAMSplitGuesser(guesserSin, conf);
        final LinearBAMIndex idx = new LinearBAMIndex(sin, dict);
        // searches for the first contig that contains linear bins
        // a contig will have no linear bins if there are no reads mapped to that
        // contig (e.g., reads were aligned to a whole genome, and then reads from
        // only a single contig were selected)
        int ctgIdx = -1;
        int bin = 0;
        LinearIndex linIdx;
        int ctgBins;
        long lastStart = 0;
        do {
            ctgIdx++;
            linIdx = idx.getLinearIndex(ctgIdx);
            ctgBins = linIdx.size();
        } while (ctgBins == 0);
        long nextStart = linIdx.get(bin);
        FileVirtualSplit newSplit = null;
        boolean lastWasGuessed = false;
        // loop and process all of the splits that share a single .bai
        while (splitsEnd < splits.size() && ((FileSplit) (splits.get(splitsEnd))).getPath() == path) {
            FileSplit fSplit = (FileSplit) splits.get(splitsEnd);
            splitsEnd++;
            if (splitsEnd >= splits.size()) {
                break;
            }
            long fSplitEnd = (fSplit.getStart() + fSplit.getLength()) << 16;
            lastStart = nextStart;
            // we need to advance and find the first linear index bin
            // that starts after the current split ends.
            // this is the end of our split.
            while (nextStart < fSplitEnd && ctgIdx < dict.size()) {
                // are we going off of the end of this contig?
                // if so, advance to the next contig with a linear bin
                if (bin + 1 >= ctgBins) {
                    do {
                        ctgIdx += 1;
                        bin = 0;
                        if (ctgIdx >= dict.size()) {
                            break;
                        }
                        linIdx = idx.getLinearIndex(ctgIdx);
                        ctgBins = linIdx.size();
                    } while (ctgBins == 0);
                }
                if (ctgIdx < dict.size() && linIdx.size() > bin) {
                    nextStart = linIdx.get(bin);
                    bin++;
                }
            }
            // is this the first split?
            // if so, split ranges from where the reads start until the identified end
            if (fSplit.getStart() == 0) {
                try (final SeekableStream inFile = WrapSeekable.openPath(path.getFileSystem(conf), path)) {
                    SamReader open = SamReaderFactory.makeDefault().setUseAsyncIo(false).open(SamInputResource.of(inFile));
                    SAMFileSpan span = open.indexing().getFilePointerSpanningReads();
                    long bamStart = ((BAMFileSpan) span).getFirstOffset();
                    newSplit = new FileVirtualSplit(fSplit.getPath(), bamStart, nextStart - 1, fSplit.getLocations());
                    newSplits.add(newSplit);
                }
            } else {
                // did we find any blocks that started in the last split?
                // if yes, then we're fine
                // if no, then we need to guess a split start (in the else clause)
                if (lastStart != nextStart) {
                    if (lastWasGuessed) {
                        newSplit.setEndVirtualOffset(lastStart - 1);
                        lastWasGuessed = false;
                    }
                    newSplit = new FileVirtualSplit(fSplit.getPath(), lastStart, nextStart - 1, fSplit.getLocations());
                    newSplits.add(newSplit);
                } else {
                    // guess the start
                    long alignedBeg = guesser.guessNextBAMRecordStart(fSplit.getStart(), fSplit.getStart() + fSplit.getLength());
                    newSplit.setEndVirtualOffset(alignedBeg - 1);
                    lastStart = alignedBeg;
                    nextStart = alignedBeg;
                    newSplit = new FileVirtualSplit(fSplit.getPath(), alignedBeg, alignedBeg + 1, fSplit.getLocations());
                    lastWasGuessed = true;
                    newSplits.add(newSplit);
                }
            }
            lastStart = nextStart;
        }
        // clean up the last split
        if (splitsEnd == splits.size()) {
            if (lastWasGuessed) {
                newSplit.setEndVirtualOffset(lastStart - 1);
                lastWasGuessed = false;
            }
            FileSplit fSplit = (FileSplit) splits.get(splitsEnd - 1);
            long fSplitEnd = (fSplit.getStart() + fSplit.getLength()) << 16;
            newSplit = new FileVirtualSplit(fSplit.getPath(), lastStart, fSplitEnd, fSplit.getLocations());
            newSplits.add(newSplit);
        }
    }
    return splitsEnd + 1;
}

16 View Complete Implementation : TemporaryInputFormatTest.java
Copyright Apache License 2.0
Author : asakusafw
/**
 * computing splits with aligned blocks plus a remainder.
 */
@Test
public void splits_aligned_rest() {
    BlockMap blocks = blocks("testing", TemporaryFile.BLOCK_SIZE, TemporaryFile.BLOCK_SIZE + 10);
    List<FileSplit> splits = TemporaryInputFormat.computeSplits(new Path("testing"), blocks, m(64));
    assertThat(splits, hasSize(2));
    FileSplit s0 = find(splits, 0);
    assertThat(s0.getLength(), is((long) TemporaryFile.BLOCK_SIZE));
    FileSplit s1 = find(splits, TemporaryFile.BLOCK_SIZE);
    assertThat(s1.getLength(), is((long) TemporaryFile.BLOCK_SIZE + 10));
}