org.apache.spark.api.java.JavaSparkContext - java examples

Here are examples of the Java API org.apache.spark.api.java.JavaSparkContext, taken from open source projects. By voting up you can indicate which examples are most useful and appropriate.

155 Examples
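
Before the project examples below, here is a minimal sketch of the basic JavaSparkContext lifecycle using only the standard Spark Java API. The app name and local master are placeholders; real jobs usually get both from spark-submit.

import java.util.Arrays;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;

public class JavaSparkContextMinimalExample {
    public static void main(String[] args) {
        // App name and master are placeholder values for a local run.
        SparkConf conf = new SparkConf().setAppName("javasparkcontext-example").setMaster("local[*]");
        JavaSparkContext jsc = new JavaSparkContext(conf);
        try {
            // parallelize() turns a local collection into an RDD; count() triggers the job.
            long n = jsc.parallelize(Arrays.asList(1, 2, 3, 4)).count();
            System.out.println("count = " + n);
        } finally {
            // Always stop the context, mirroring the try/finally pattern used in several examples below.
            jsc.stop();
        }
    }
}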

19 View Complete Implementation : GATKSparkTool.java
Copyright BSD 3-Clause "New" or "Revised" License
Author : broadinstitute
/**
 * Initialize standard tool inputs.
 */
private void initializeToolInputs(final JavaSparkContext sparkContext) {
    initializeReference();
    // reference must be initialized before reads
    initializeReads(sparkContext);
    initializeFeatures();
    initializeIntervals();
}

19 View Complete Implementation : StructuralVariationDiscoveryPipelineSpark.java
Copyright BSD 3-Clause "New" or "Revised" License
Author : broadinstitute
private SvDiscoveryInputMetaData getSvDiscoveryInputData(final JavaSparkContext ctx, final SAMFileHeader headerForReads, final FindBreakpointEvidenceSpark.AssembledEvidenceResults assembledEvidenceResults) {
    final Broadcast<SVIntervalTree<VariantContext>> cnvCallsBroadcast = broadcastCNVCalls(ctx, headerForReads, discoverStageArgs.cnvCallsFile);
    try {
        if (!java.nio.file.Files.exists(Paths.get(variantsOutDir))) {
            IOUtils.createDirectory(variantsOutDir);
        }
    } catch (final IOException ioex) {
        throw new GATKException("Failed to create output directory " + variantsOutDir + " though it does not yet exist", ioex);
    }
    final String outputPrefixWithSampleName = variantsOutDir + (variantsOutDir.endsWith("/") ? "" : "/") + SVUtils.getSampleId(headerForReads) + "_";
    return new SvDiscoveryInputMetaData(ctx, discoverStageArgs, evidenceAndAssemblyArgs.crossContigsToIgnoreFile, outputPrefixWithSampleName, assembledEvidenceResults.getReadMetadata(), assembledEvidenceResults.getAssembledIntervals(), makeEvidenceLinkTree(assembledEvidenceResults.getEvidenceTargetLinks()), cnvCallsBroadcast, getHeaderForReads(), getReference(), getDefaultToolVCFHeaderLines(), localLogger);
}

19 View Complete Implementation : SparkSharder.java
Copyright BSD 3-Clause "New" or "Revised" License
Author : broadinstitute
/**
 * Join an RDD of locatables with a set of intervals, and apply a function to process the locatables that overlap each interval.
 * This differs from {@link #joinOverlapping(JavaSparkContext, JavaRDD, Class, SAMSequenceDictionary, List, int, MapFunction)}
 * in that the function to apply is given two iterators: one over intervals, and one over locatables (for the partition),
 * and it is up to the function implementation to find overlaps between intervals and locatables.
 * @param ctx the Spark Context
 * @param locatables the locatables RDD, must be coordinate sorted
 * @param locatableClass the class of the locatables, must be a subclass of {@link Locatable}
 * @param sequenceDictionary the sequence dictionary to use to find contig lengths
 * @param intervals the collection of intervals to apply the function to
 * @param maxLocatableLength the maximum length of a {@link Locatable}, if any is larger than this size then an exception will be thrown
 * @param f the function to process intervals and overlapping locatables with
 * @param <L> the {@link Locatable} type
 * @param <I> the interval type
 * @param <T> the return type of <code>f</code>
 * @return an RDD of the values produced by <code>f</code>
 */
private static <L extends Locatable, I extends Locatable, T> JavaRDD<T> joinOverlapping(JavaSparkContext ctx, JavaRDD<L> locatables, Class<L> locatableClass, SAMSequenceDictionary sequenceDictionary, List<I> intervals, int maxLocatableLength, FlatMapFunction2<Iterator<L>, Iterator<I>, T> f) {
    return joinOverlapping(ctx, locatables, locatableClass, sequenceDictionary, ctx.parallelize(intervals), maxLocatableLength, f);
}

19 View Complete Implementation : ReadsSparkSink.java
Copyright BSD 3-Clause "New" or "Revised" License
Author : broadinstitute
/**
 * writeReads writes rddReads to outputFile with header as the file header.
 * @param ctx the JavaSparkContext to write.
 * @param outputFile path to the output bam.
 * @param referenceFile path to the reference. required for cram output, otherwise may be null.
 * @param reads reads to write.
 * @param header the header to put at the top of the files
 * @param format should the output be a single file, sharded, ADAM, etc.
 * @param numReducers the number of reducers to use when writing a single file. A value of zero indicates that the default
 *                    should be used.
 * @param outputPartsDir directory for temporary files for SINGLE output format, should be null for default value of filename + .output
 * @param sortReadsToHeader if true, the writer will perform a sort of reads according to the sort order of the header before writing
 */
public static void writeReads(final JavaSparkContext ctx, final String outputFile, final String referenceFile, final JavaRDD<GATKRead> reads, final SAMFileHeader header, ReadsWriteFormat format, final int numReducers, final String outputPartsDir, final boolean sortReadsToHeader) throws IOException {
    writeReads(ctx, outputFile, referenceFile, reads, header, format, numReducers, outputPartsDir, true, true, sortReadsToHeader);
}

19 View Complete Implementation : GATKSparkTool.java
Copyright BSD 3-Clause "New" or "Revised" License
Author : broadinstitute
/**
 * Initializes our reads source (but does not yet load the reads into a {@link JavaRDD}).
 * Does nothing if no reads inputs are present.
 */
private void initializeReads(final JavaSparkContext sparkContext) {
    if (readArguments.getReadFilesNames().isEmpty()) {
        return;
    }
    if (getReadInputMergingPolicy() == ReadInputMergingPolicy.doNotMerge && readArguments.getReadFilesNames().size() != 1) {
        throw new UserException("Sorry, we only support a single reads input for this spark tool.");
    }
    readInputs = new LinkedHashMap<>();
    readsSource = new ReadsSparkSource(sparkContext, readArguments.getReadValidationStringency());
    for (String input : readArguments.getReadFilesNames()) {
        readInputs.put(input, readsSource.getHeader(input, hasReference() ? referenceArguments.getReferenceFileName() : null));
    }
    readsHeader = createHeaderMerger().getMergedHeader();
}

19 View Complete Implementation : HoodieTable.java
Copyright Apache License 2.0
Author : apache
/**
 * Finalize the written data onto storage. Perform any final cleanups.
 *
 * @param jsc Spark Context
 * @param stats List of HoodieWriteStats
 * @throws HoodieIOException if some paths can't be finalized on storage
 */
public void finalizeWrite(JavaSparkContext jsc, String instantTs, List<HoodieWriteStat> stats) throws HoodieIOException {
    cleanFailedWrites(jsc, instantTs, stats, config.getConsistencyGuardConfig().isConsistencyCheckEnabled());
}

19 View Complete Implementation : SparkSharder.java
Copyright BSD 3-Clause "New" or "Revised" License
Author : broadinstitute
private static <L extends Locatable, SB extends ShardBoundary> JavaRDD<Shard<L>> shard(JavaSparkContext ctx, JavaRDD<L> locatables, Class<L> locatableClass, SAMSequenceDictionary sequenceDictionary, JavaRDD<SB> intervals, int maxLocatableLength, boolean useShuffle) {
    JavaRDD<ShardBoundary> paddedIntervals = intervals.map(ShardBoundary::paddedShardBoundary);
    if (useShuffle) {
        throw new UnsupportedOperationException("Shuffle not supported when sharding an RDD of intervals.");
    }
    return joinOverlapping(ctx, locatables, locatableClass, sequenceDictionary, paddedIntervals, maxLocatableLength, new MapFunction<Tuple2<ShardBoundary, Iterable<L>>, Shard<L>>() {

        private static final long serialVersionUID = 1L;

        @Override
        public Shard<L> call(Tuple2<ShardBoundary, Iterable<L>> value) {
            return value._1().createShard(value._2());
        }
    });
}

19 View Complete Implementation : HDFSParquetImporter.java
Copyright Apache License 2.0
Author : apache
public static void main(String[] args) throws Exception {
    final Config cfg = new Config();
    JCommander cmd = new JCommander(cfg, null, args);
    if (cfg.help || args.length == 0) {
        cmd.usage();
        System.exit(1);
    }
    HDFSParquetImporter dataImporter = new HDFSParquetImporter(cfg);
    JavaSparkContext jssc = UtilHelpers.buildSparkContext("data-importer-" + cfg.tableName, cfg.sparkMaster, cfg.sparkMemory);
    try {
        dataImporter.dataImport(jssc, cfg.retry);
    } finally {
        jssc.stop();
    }
}

19 View Complete Implementation : HaplotypeCallerSpark.java
Copyright BSD 3-Clause "New" or "Revised" License
Author : broadinstitute
@Override
protected void processAssemblyRegions(JavaRDD<AssemblyRegionWalkerContext> rdd, JavaSparkContext ctx) {
    processAssemblyRegions(rdd, ctx, getHeaderForReads(), referenceArguments.getReferenceFileName(), hcArgs, output, makeVariantAnnotations(), logger, createOutputVariantIndex);
}

19 View Complete Implementation : PSBwaAlignerSpark.java
Copyright BSD 3-Clause "New" or "Revised" License
Author : broadinstitute
/**
 * Wrapper class for using the PathSeq Bwa aligner class in Spark. Encapsulates closing the index when done.
 */
public final class PSBwaAlignerSpark implements AutoCloseable {

    final PSBwaArgumentCollection bwaArgs;

    private final JavaSparkContext ctx;

    public PSBwaAlignerSpark(final JavaSparkContext ctx, final PSBwaArgumentCollection bwaArgs) {
        this.ctx = ctx;
        this.bwaArgs = bwaArgs;
    }

    public JavaRDD<GATKRead> doBwaAlignment(final JavaRDD<GATKRead> reads, final boolean pairedAlignment, final Broadcast<SAMFileHeader> header) {
        final PSBwaArgumentCollection bwaArgsLocal = bwaArgs;
        return reads.mapPartitions(itr -> (new PSBwaAligner(bwaArgsLocal, pairedAlignment)).apply(itr, header.value()));
    }

    // Run this after invoking a Spark action on all RDDs returned from doBwaAlignment()
    public void close() {
        BwaMemIndexCache.closeAllDistributedInstances(ctx);
    }
}

19 View Complete Implementation : HoodieTable.java
Copyright Apache License 2.0
Author : apache
/**
 * Ensures all files passed either appear or disappear.
 *
 * @param jsc JavaSparkContext
 * @param groupByPartition Files grouped by partition
 * @param visibility Appear/Disappear
 */
private void waitForAllFiles(JavaSparkContext jsc, Map<String, List<Pair<String, String>>> groupByPartition, FileVisibility visibility) {
    // Check, in parallel, that every file in each partition has reached the requested visibility.
    boolean checkPassed = jsc.parallelize(new ArrayList<>(groupByPartition.entrySet()), config.getFinalizeWriteParallelism()).map(partitionWithFileList -> waitForCondition(partitionWithFileList.getKey(), partitionWithFileList.getValue().stream(), visibility)).collect().stream().allMatch(x -> x);
    if (!checkPassed) {
        throw new HoodieIOException("Consistency check failed to ensure all files " + visibility);
    }
}

19 View Complete Implementation : TimelineServerPerf.java
Copyright Apache License 2.0
Author : apache
public void run() throws IOException {
    List<String> allPartitionPaths = FSUtils.getAllPartitionPaths(timelineServer.getFs(), cfg.basePath, true);
    Collections.shuffle(allPartitionPaths);
    List<String> selected = allPartitionPaths.stream().filter(p -> !p.contains("error")).limit(cfg.maxPartitions).collect(Collectors.toList());
    JavaSparkContext jsc = UtilHelpers.buildSparkContext("hudi-view-perf-" + cfg.basePath, cfg.sparkMaster);
    if (!useExternalTimelineServer) {
        this.timelineServer.startService();
        setHostAddrFromSparkConf(jsc.getConf());
    } else {
        this.hostAddr = cfg.serverHost;
    }
    HoodieTableMetaClient metaClient = new HoodieTableMetaClient(timelineServer.getConf(), cfg.basePath, true);
    SyncableFileSystemView fsView = new RemoteHoodieTableFileSystemView(this.hostAddr, cfg.serverPort, metaClient);
    String reportDir = cfg.reportDir;
    metaClient.getFs().mkdirs(new Path(reportDir));
    String dumpPrefix = UUID.randomUUID().toString();
    System.out.println("First Iteration to load all partitions");
    Dumper d = new Dumper(metaClient.getFs(), new Path(reportDir, String.format("1_%s.csv", dumpPrefix)));
    d.init();
    d.dump(runLookups(jsc, selected, fsView, 1, 0));
    d.close();
    System.out.println("\n\n\n First Iteration is done");
    Dumper d2 = new Dumper(metaClient.getFs(), new Path(reportDir, String.format("2_%s.csv", dumpPrefix)));
    d2.init();
    d2.dump(runLookups(jsc, selected, fsView, cfg.numIterations, cfg.numCoresPerExecutor));
    d2.close();
    System.out.println("\n\n\nDumping all File Slices");
    selected.stream().forEach(p -> fsView.getAllFileSlices(p).forEach(s -> System.out.println("\tMyFileSlice=" + s)));
    // Waiting for curl queries
    if (!useExternalTimelineServer && cfg.waitForManualQueries) {
        System.out.println("Timeline Server Host Address=" + hostAddr + ", port=" + timelineServer.getServerPort());
        while (true) {
            try {
                Thread.sleep(60000);
            } catch (InterruptedException e) {
            // skip it
            }
        }
    }
}

19 View Complete Implementation : HoodieTable.java
Copyright Apache License 2.0
Author : apache
public static <T extends HoodieRecordPayload> HoodieTable<T> getHoodieTable(HoodieTableMetaClient metaClient, HoodieWriteConfig config, JavaSparkContext jsc) {
    switch(metaClient.getTableType()) {
        case COPY_ON_WRITE:
            return new HoodieCopyOnWriteTable<>(config, jsc);
        case MERGE_ON_READ:
            return new HoodieMergeOnReadTable<>(config, jsc);
        default:
            throw new HoodieException("Unsupported table type :" + metaClient.getTableType());
    }
}

19 View Complete Implementation : PathSeqBwaSpark.java
Copyright BSD 3-Clause "New" or "Revised" License
Author : broadinstitute
/**
 * Writes RDD of reads to path. Note writeReads() is not used because there are separate paired/unpaired outputs.
 * Header sequence dictionary is reduced to only those that were aligned to.
 */
private void writeBam(final JavaRDD<GATKRead> reads, final String inputBamPath, final boolean isPaired, final JavaSparkContext ctx, SAMFileHeader header) {
    // Only retain header sequences that were aligned to.
    // This invokes an action and therefore the reads must be cached.
    reads.persist(StorageLevel.MEMORY_AND_DISK_SER());
    header = PSBwaUtils.removeUnmappedHeaderSequences(header, reads, logger);
    final String outputPath = isPaired ? outputPaired : outputUnpaired;
    try {
        ReadsSparkSink.writeReads(ctx, outputPath, bwaArgs.referencePath, reads, header, shardedOutput ? ReadsWriteFormat.SHARDED : ReadsWriteFormat.SINGLE, PSUtils.pathseqGetRecommendedNumReducers(inputBamPath, numReducers, getTargetPartitionSize()), shardedPartsDir, true);
    } catch (final IOException e) {
        throw new UserException.CouldNotCreateOutputFile(outputPath, "Writing failed", e);
    }
}

19 View Complete Implementation : HoodieBloomIndex.java
Copyright Apache License 2.0
Author : apache
/**
 * Returns an RDD mapping each HoodieKey to a partitionPath/fileID which contains it. Option.Empty if the key is not
 * found.
 *
 * @param hoodieKeys keys to lookup
 * @param jsc spark context
 * @param hoodieTable hoodie table object
 */
@Override
public JavaPairRDD<HoodieKey, Option<Pair<String, String>>> fetchRecordLocation(JavaRDD<HoodieKey> hoodieKeys, JavaSparkContext jsc, HoodieTable<T> hoodieTable) {
    JavaPairRDD<String, String> partitionRecordKeyPairRDD = hoodieKeys.mapToPair(key -> new Tuple2<>(key.getPartitionPath(), key.getRecordKey()));
    // Lookup indexes for all the partition/recordkey pairs
    JavaPairRDD<HoodieKey, HoodieRecordLocation> recordKeyLocationRDD = lookupIndex(partitionRecordKeyPairRDD, jsc, hoodieTable);
    JavaPairRDD<HoodieKey, String> keyHoodieKeyPairRDD = hoodieKeys.mapToPair(key -> new Tuple2<>(key, null));
    return keyHoodieKeyPairRDD.leftOuterJoin(recordKeyLocationRDD).mapToPair(keyLoc -> {
        Option<Pair<String, String>> partitionPathFileidPair;
        if (keyLoc._2._2.isPresent()) {
            partitionPathFileidPair = Option.of(Pair.of(keyLoc._1().getPartitionPath(), keyLoc._2._2.get().getFileId()));
        } else {
            partitionPathFileidPair = Option.empty();
        }
        return new Tuple2<>(keyLoc._1, partitionPathFileidPair);
    });
}

19 View Complete Implementation : HoodieSnapshotCopier.java
Copyright Apache License 2.0
Author : apache
public static void main(String[] args) throws IOException {
    // Take input configs
    final Config cfg = new Config();
    new JCommander(cfg, null, args);
    LOG.info(String.format("Snapshot hoodie table from %s targetBasePath to %stargetBasePath", cfg.basePath, cfg.outputPath));
    // Create a spark job to do the snapshot copy
    SparkConf sparkConf = new SparkConf().setAppName("Hoodie-snapshot-copier");
    sparkConf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
    JavaSparkContext jsc = new JavaSparkContext(sparkConf);
    LOG.info("Initializing spark job.");
    // Copy
    HoodieSnapshotCopier copier = new HoodieSnapshotCopier();
    copier.snapshot(jsc, cfg.basePath, cfg.outputPath, cfg.shouldAssumeDatePartitioning);
    // Stop the job
    jsc.stop();
}

19 View Complete Implementation : UtilHelpers.java
Copyright Apache License 2.0
Author : apache
public static int handleErrors(JavaSparkContext jsc, String instantTime, JavaRDD<WriteStatus> writeResponse) {
    Accumulator<Integer> errors = jsc.accumulator(0);
    writeResponse.foreach(writeStatus -> {
        if (writeStatus.hasErrors()) {
            errors.add(1);
            LOG.error(String.format("Error processing records :writeStatus:%s", writeStatus.getStat().toString()));
        }
    });
    if (errors.value() == 0) {
        LOG.info(String.format("Table imported into hoodie with %s instant time.", instantTime));
        return 0;
    }
    LOG.error(String.format("Import failed with %d errors.", errors.value()));
    return -1;
}

19 View Complete Implementation : BwaMemIndexCache.java
Copyright BSD 3-Clause "New" or "Revised" License
Author : broadinstitute
/**
 * Closes all instances in all the VMs involved in the spark context provided.
 * @param ctx the spark context.
 */
public static void closeAllDistributedInstances(final JavaSparkContext ctx) {
    Utils.nonNull(ctx, "the context provided cannot be null");
    int nJobs = ctx.defaultParallelism();
    final List<Integer> jobList = new ArrayList<>(nJobs);
    for (int idx = 0; idx != nJobs; ++idx) jobList.add(idx);
    ctx.parallelize(jobList, nJobs).foreach(idx -> closeInstances());
}
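
The example above shows a common idiom for running cleanup code on every executor JVM: parallelize one dummy element per default-parallelism slot and run a foreach whose only effect is the cleanup call. A minimal, generic sketch of the same idiom follows; cleanUpPerJvm() is a hypothetical stand-in for BwaMemIndexCache.closeInstances(), and the approach is best-effort since Spark does not guarantee that every executor receives a task.

import java.util.ArrayList;
import java.util.List;
import org.apache.spark.api.java.JavaSparkContext;

public final class DistributedCleanupExample {

    // Hypothetical per-JVM cleanup hook; release native or static resources here.
    private static void cleanUpPerJvm() {
    }

    public static void runOnAllExecutors(final JavaSparkContext ctx) {
        final int nTasks = ctx.defaultParallelism();
        final List<Integer> dummy = new ArrayList<>(nTasks);
        for (int i = 0; i < nTasks; i++) {
            dummy.add(i);
        }
        // One task per slot; each task triggers the cleanup in whatever executor JVM it lands on.
        ctx.parallelize(dummy, nTasks).foreach(i -> cleanUpPerJvm());
    }
}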

19 View Complete Implementation : MarkDuplicatesSparkUtilsUnitTest.java
Copyright BSD 3-Clause "New" or "Revised" License
Author : broadinstitute
// This helper method is used to generate groups of reads that will be duplicate marked. It does this by generating numDuplicatesPerGroup
// pairs of reads starting at randomly selected starting locations. The start locations are random so that if the resulting RDD is
// coordinate sorted it is more or less guaranteed that a large portion of the reads will reside on separate partitions from
// their mates. It also handles sorting of the reads into either queryname or coordinate order.
private JavaRDD<GATKRead> generateReadsWithDuplicates(int numReadGroups, int numDuplicatesPerGroup, JavaSparkContext ctx, int numPartitions, boolean coordinate) {
    int readNameCounter = 0;
    SAMRecordSetBuilder samRecordSetBuilder = new SAMRecordSetBuilder(true, SAMFileHeader.SortOrder.coordinate, true, SAMRecordSetBuilder.DEFAULT_CHROMOSOME_LENGTH, SAMRecordSetBuilder.DEFAULT_DUPLICATE_SCORING_STRATEGY);
    Random rand = new Random(10);
    for (int i = 0; i < numReadGroups; i++) {
        int start1 = rand.nextInt(SAMRecordSetBuilder.DEFAULT_CHROMOSOME_LENGTH);
        int start2 = rand.nextInt(SAMRecordSetBuilder.DEFAULT_CHROMOSOME_LENGTH);
        for (int j = 0; j < numDuplicatesPerGroup; j++) {
            samRecordSetBuilder.addPair("READ" + readNameCounter++, 0, start1, start2);
        }
    }
    List<SAMRecord> records = Lists.newArrayList(samRecordSetBuilder.getRecords());
    if (coordinate) {
        records.sort(new SAMRecordCoordinateComparator());
    } else {
        records.sort(new SAMRecordQueryNameComparator());
    }
    return ctx.parallelize(records, numPartitions).map(SAMRecordToGATKReadAdapter::new);
}

19 View Complete Implementation : HaplotypeCallerSpark.java
Copyright BSD 3-Clause "New" or "Revised" License
Author : broadinstitute
private static Broadcast<Supplier<AssemblyRegionEvaluator>> assemblyRegionEvaluatorSupplierBroadcastFunction(final JavaSparkContext ctx, final HaplotypeCallerArgumentCollection hcArgs, final SAMFileHeader header, final ReferenceSequenceFile taskReferenceSequenceFile, final VariantAnnotatorEngine annotatorEngine) {
    Supplier<AssemblyRegionEvaluator> supplier = new Supplier<AssemblyRegionEvaluator>() {

        @Override
        public AssemblyRegionEvaluator get() {
            return new HaplotypeCallerEngine(hcArgs, false, false, header, taskReferenceSequenceFile, annotatorEngine);
        }
    };
    return ctx.broadcast(supplier);
}

19 View Complete Implementation : SparkMain.java
Copyright Apache License 2.0
Author : apache
private static int rollbackToSavepoint(JavaSparkContext jsc, String savepointTime, String basePath) throws Exception {
    HoodieWriteClient client = createHoodieClient(jsc, basePath);
    if (client.rollbackToSavepoint(savepointTime)) {
        LOG.info(String.format("The commit \"%s\" rolled back.", savepointTime));
        return 0;
    } else {
        LOG.info(String.format("The commit \"%s\" failed to roll back.", savepointTime));
        return -1;
    }
}

19 View Complete Implementation : GATKSparkTool.java
Copyright BSD 3-Clause "New" or "Revised" License
Author : broadinstitute
/**
 * Writes the reads from a {@link JavaRDD} to an output file.
 * @param ctx the JavaSparkContext to write.
 * @param outputFile path to the output bam/cram.
 * @param reads reads to write.
 * @param header the header to write.
 */
public void writeReads(final JavaSparkContext ctx, final String outputFile, JavaRDD<GATKRead> reads, SAMFileHeader header, final boolean sortReadsToHeader) {
    try {
        ReadsSparkSink.writeReads(ctx, outputFile, hasReference() ? referenceArguments.getReferencePath().toAbsolutePath().toUri().toString() : null, reads, header, shardedOutput ? ReadsWriteFormat.SHARDED : ReadsWriteFormat.SINGLE, getRecommendedNumReducers(), shardedPartsDir, createOutputBamIndex, createOutputBamSplittingIndex, sortReadsToHeader);
    } catch (IOException e) {
        throw new UserException.CouldNotCreateOutputFile(outputFile, "writing failed", e);
    }
}

19 View Complete Implementation : SparkSharder.java
Copyright BSD 3-Clause "New" or "Revised" License
Author : broadinstitute
/**
 * Create an RDD of {@link Shard} from an RDD of coordinate sorted {@link Locatable} <i>without using a shuffle</i>,
 * and where the intervals for shards are specified as an RDD, rather than a list.
 * Each shard contains the {@link Locatable} objects that overlap it (including overlapping only padding).
 * @param ctx the Spark Context
 * @param locatables the RDD of {@link Locatable}, must be coordinate sorted
 * @param locatableClass the class of the {@link Locatable} objects in the RDD
 * @param sequenceDictionary the sequence dictionary to use to find contig lengths
 * @param intervals the {@link ShardBoundary} objects to create shards for, must be coordinate sorted
 * @param maxLocatableLength the maximum length of a {@link Locatable}, if any is larger than this size then an exception will be thrown
 * @param <L> the {@link Locatable} type
 * @param <SB> the {@link ShardBoundary} type
 * @return an RDD of {@link Shard} of overlapping {@link Locatable} objects (including overlapping only padding)
 */
public static <L extends Locatable, SB extends ShardBoundary> JavaRDD<Shard<L>> shard(JavaSparkContext ctx, JavaRDD<L> locatables, Class<L> locatableClass, SAMSequenceDictionary sequenceDictionary, JavaRDD<SB> intervals, int maxLocatableLength) {
    return shard(ctx, locatables, locatableClass, sequenceDictionary, intervals, maxLocatableLength, false);
}

19 View Complete Implementation : SparkSharder.java
Copyright BSD 3-Clause "New" or "Revised" License
Author : broadinstitute
/**
 * Create an RDD of {@link Shard} from an RDD of coordinate sorted {@link Locatable} <i>without using a shuffle</i>.
 * Each shard contains the {@link Locatable} objects that overlap it (including overlapping only padding).
 * @param ctx the Spark Context
 * @param locatables the RDD of {@link Locatable}, must be coordinate sorted
 * @param locatableClass the class of the {@link Locatable} objects in the RDD
 * @param sequenceDictionary the sequence dictionary to use to find contig lengths
 * @param intervals the {@link ShardBoundary} objects to create shards for, must be coordinate sorted
 * @param maxLocatableLength the maximum length of a {@link Locatable}, if any is larger than this size then an exception will be thrown
 * @param <L> the {@link Locatable} type
 * @param <SB> the {@link ShardBoundary} type
 * @return an RDD of {@link Shard} of overlapping {@link Locatable} objects (including overlapping only padding)
 */
public static <L extends Locatable, SB extends ShardBoundary> JavaRDD<Shard<L>> shard(JavaSparkContext ctx, JavaRDD<L> locatables, Class<L> locatableClass, SAMSequenceDictionary sequenceDictionary, List<SB> intervals, int maxLocatableLength) {
    return shard(ctx, locatables, locatableClass, sequenceDictionary, intervals, maxLocatableLength, false);
}

19 View Complete Implementation : HDFSParquetImporter.java
Copyright Apache License 2.0
Author : apache
public int dataImport(JavaSparkContext jsc, int retry) {
    this.fs = FSUtils.getFs(cfg.targetPath, jsc.hadoopConfiguration());
    this.props = cfg.propsFilePath == null ? UtilHelpers.buildProperties(cfg.configs) : UtilHelpers.readConfig(fs, new Path(cfg.propsFilePath), cfg.configs).getConfig();
    LOG.info("Starting data import with configs : " + props.toString());
    int ret = -1;
    try {
        // Verify that targetPath is not present.
        if (fs.exists(new Path(cfg.targetPath))) {
            throw new HoodieIOException(String.format("Make sure %s is not present.", cfg.targetPath));
        }
        do {
            ret = dataImport(jsc);
        } while (ret != 0 && retry-- > 0);
    } catch (Throwable t) {
        LOG.error(t);
    }
    return ret;
}

19 View Complete Implementation : ReadsSparkSink.java
Copyright BSD 3-Clause "New" or "Revised" License
Author : broadinstitute
/**
 * writeReads writes rddReads to outputFile with header as the file header.
 * @param ctx the JavaSparkContext to write.
 * @param outputFile path to the output bam.
 * @param referenceFile path to the reference. required for cram output, otherwise may be null.
 * @param reads reads to write.
 * @param header the header to put at the top of the files
 * @param format should the output be a single file, sharded, ADAM, etc.
 */
public static void writeReads(final JavaSparkContext ctx, final String outputFile, final String referenceFile, final JavaRDD<GATKRead> reads, final SAMFileHeader header, ReadsWriteFormat format) throws IOException {
    writeReads(ctx, outputFile, referenceFile, reads, header, format, 0, null, true);
}

19 View Complete Implementation : PathSeqBwaSpark.java
Copyright BSD 3-Clause "New" or "Revised" License
Author : broadinstitute
/**
 * Loads a bam, aligns using the given aligner, and writes to a new bam. Returns false if the input bam could not
 * be read.
 */
private boolean alignBam(final String inputBamPath, final PSBwaAlignerSpark aligner, final boolean isPaired, final JavaSparkContext ctx, final ReadsSparkSource readsSource) {
    final Tuple2<SAMFileHeader, JavaRDD<GATKRead>> loadedBam = loadBam(inputBamPath, readsSource);
    if (loadedBam == null)
        return false;
    final SAMFileHeader header = loadedBam._1;
    final JavaRDD<GATKRead> reads = loadedBam._2;
    Utils.nonNull(header);
    Utils.nonNull(reads);
    if (isPaired && !header.getSortOrder().equals(SAMFileHeader.SortOrder.queryname)) {
        throw new UserException.BadInput("Paired input BAM must be sorted by queryname");
    }
    final JavaRDD<GATKRead> alignedReads = aligner.doBwaAlignment(reads, isPaired, ctx.broadcast(header));
    writeBam(alignedReads, inputBamPath, isPaired, ctx, header);
    return true;
}

19 View Complete Implementation : InMemoryHashIndex.java
Copyright Apache License 2.0
Author : apache
@Override
public JavaRDD<WriteStatus> updateLocation(JavaRDD<WriteStatus> writeStatusRDD, JavaSparkContext jsc, HoodieTable<T> hoodieTable) {
    return writeStatusRDD.map(new Function<WriteStatus, WriteStatus>() {

        @Override
        public WriteStatus call(WriteStatus writeStatus) {
            for (HoodieRecord record : writeStatus.getWrittenRecords()) {
                if (!writeStatus.isErrored(record.getKey())) {
                    HoodieKey key = record.getKey();
                    Option<HoodieRecordLocation> newLocation = record.getNewLocation();
                    if (newLocation.isPresent()) {
                        recordLocationMap.put(key, newLocation.get());
                    } else {
                        // Delete existing index for a deleted record
                        recordLocationMap.remove(key);
                    }
                }
            }
            return writeStatus;
        }
    });
}

19 View Complete Implementation : StructuralVariationDiscoveryPipelineSpark.java
Copyright BSD 3-Clause "New" or "Revised" License
Author : broadinstitute
public static Broadcast<SVIntervalTree<VariantContext>> broadcastCNVCalls(final JavaSparkContext ctx, final SAMFileHeader header, final String cnvCallsFile) {
    final SVIntervalTree<VariantContext> cnvCalls;
    if (cnvCallsFile != null) {
        cnvCalls = CNVInputReader.loadCNVCalls(cnvCallsFile, header);
    } else {
        cnvCalls = null;
    }
    final Broadcast<SVIntervalTree<VariantContext>> broadcastCNVCalls;
    if (cnvCalls != null) {
        broadcastCNVCalls = ctx.broadcast(cnvCalls);
    } else {
        broadcastCNVCalls = null;
    }
    return broadcastCNVCalls;
}
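
The method above returns a Broadcast handle (or null when there is no CNV file); executors read the payload back with value(), as the alignment example earlier does with header.value(). A minimal, generic sketch of the broadcast-then-read pattern with a toy lookup map (all names here are placeholders):

import java.util.HashMap;
import java.util.Map;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.broadcast.Broadcast;

public final class BroadcastLookupExample {

    /** Broadcast a small lookup table once, then read it back with value() inside tasks. */
    public static JavaRDD<String> annotate(final JavaSparkContext ctx, final JavaRDD<String> keys) {
        final Map<String, String> lookup = new HashMap<>();
        lookup.put("chr1", "first contig"); // toy payload standing in for the CNV interval tree above
        final Broadcast<Map<String, String>> lookupBroadcast = ctx.broadcast(lookup);
        return keys.map(k -> k + "=" + lookupBroadcast.value().getOrDefault(k, "unknown"));
    }
}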

19 View Complete Implementation : TestHDFSParquetImporter.java
Copyright Apache License 2.0
Author : apache
/**
 * Test for missing rowKey and partitionKey.
 */
@Test
public void testRowAndPartitionKey() throws Exception {
    JavaSparkContext jsc = null;
    try {
        jsc = getJavaSparkContext();
        // Test root folder.
        String basePath = (new Path(dfsBasePath, Thread.currentThread().getStackTrace()[1].getMethodName())).toString();
        // Hoodie root folder
        Path hoodieFolder = new Path(basePath, "testTarget");
        // Create generic records.
        Path srcFolder = new Path(basePath, "testSrc");
        createRecords(srcFolder);
        // Create schema file.
        Path schemaFile = new Path(basePath.toString(), "missingFile.schema");
        createSchemaFile(schemaFile.toString());
        HDFSParquetImporter dataImporter;
        HDFSParquetImporter.Config cfg;
        // Check for invalid row key.
        cfg = getHDFSParquetImporterConfig(srcFolder.toString(), hoodieFolder.toString(), "testTable", "COPY_ON_WRITE", "invalidRowKey", "timestamp", 1, schemaFile.toString());
        dataImporter = new HDFSParquetImporter(cfg);
        assertEquals(-1, dataImporter.dataImport(jsc, 0));
        // Check for invalid partition key.
        cfg = getHDFSParquetImporterConfig(srcFolder.toString(), hoodieFolder.toString(), "testTable", "COPY_ON_WRITE", "_row_key", "invalidTimeStamp", 1, schemaFile.toString());
        dataImporter = new HDFSParquetImporter(cfg);
        assertEquals(-1, dataImporter.dataImport(jsc, 0));
    } finally {
        if (jsc != null) {
            jsc.stop();
        }
    }
}

19 View Complete Implementation : HBaseIndex.java
Copyright Apache License 2.0
Author : apache
@Override
public JavaRDD<WriteStatus> updateLocation(JavaRDD<WriteStatus> writeStatusRDD, JavaSparkContext jsc, HoodieTable<T> hoodieTable) {
    final HBaseIndexQPSResourceAllocator hBaseIndexQPSResourceAllocator = createQPSResourceAllocator(this.config);
    setPutBatchSize(writeStatusRDD, hBaseIndexQPSResourceAllocator, jsc);
    LOG.info("multiPutBatchSize: before hbase puts" + multiPutBatchSize);
    JavaRDD<WriteStatus> writeStatusJavaRDD = writeStatusRDD.mapPartitionsWithIndex(updateLocationFunction(), true);
    // caching the index updated status RDD
    writeStatusJavaRDD = writeStatusJavaRDD.persist(config.getWriteStatusStorageLevel());
    return writeStatusJavaRDD;
}

19 View Complete Implementation : HoodieMergeOnReadTable.java
Copyright Apache License 2.0
Author : apache
@Override
public HoodieCompactionPlan scheduleCompaction(JavaSparkContext jsc, String instantTime) {
    LOG.info("Checking if compaction needs to be run on " + config.getBasePath());
    Option<HoodieInstant> lastCompaction = getActiveTimeline().getCommitTimeline().filterCompletedInstants().lastInstant();
    String deltaCommitsSinceTs = "0";
    if (lastCompaction.isPresent()) {
        deltaCommitsSinceTs = lastCompaction.get().getTimestamp();
    }
    int deltaCommitsSinceLastCompaction = getActiveTimeline().getDeltaCommitTimeline().findInstantsAfter(deltaCommitsSinceTs, Integer.MAX_VALUE).countInstants();
    if (config.getInlineCompactDeltaCommitMax() > deltaCommitsSinceLastCompaction) {
        LOG.info("Not running compaction as only " + deltaCommitsSinceLastCompaction + " delta commits was found since last compaction " + deltaCommitsSinceTs + ". Waiting for " + config.getInlineCompactDeltaCommitMax());
        return new HoodieCompactionPlan();
    }
    LOG.info("Compacting merge on read table " + config.getBasePath());
    HoodieMergeOnReadTableCompactor compactor = new HoodieMergeOnReadTableCompactor();
    try {
        return compactor.generateCompactionPlan(jsc, this, config, instantTime, ((SyncableFileSystemView) getSliceView()).getPendingCompactionOperations().map(instantTimeCompactionopPair -> instantTimeCompactionopPair.getValue().getFileGroupId()).collect(Collectors.toSet()));
    } catch (IOException e) {
        throw new HoodieCompactionException("Could not schedule compaction " + config.getBasePath(), e);
    }
}

19 View Complete Implementation : CreateReadCountPanelOfNormals.java
Copyright BSD 3-Clause "New" or "Revised" License
Author : broadinstitute
@Override
protected void runPipeline(final JavaSparkContext ctx) {
    if (!new HDF5Library().load(null)) {
        // Note: passing null means using the default temp dir.
        throw new UserException.HardwareFeatureException("Cannot load the required HDF5 library. " + "HDF5 is currently supported on x86-64 architecture and Linux or OSX systems.");
    }
    validateArguments();
    // get sample filenames
    final List<String> sampleFilenames = inputReadCountFiles.stream().map(File::getAbsolutePath).collect(Collectors.toList());
    // get sequence dictionary and intervals from the first read-counts file to use to validate remaining files
    // (this first file is read again below, which is slightly inefficient but is probably not worth the extra code)
    final File firstReadCountFile = inputReadCountFiles.get(0);
    logger.info(String.format("Retrieving intervals from first read-counts file (%s)...", firstReadCountFile));
    final SimpleCountCollection firstReadCounts = SimpleCountCollection.read(firstReadCountFile);
    final SAMSequenceDictionary sequenceDictionary = firstReadCounts.getMetadata().getSequenceDictionary();
    final List<SimpleInterval> intervals = firstReadCounts.getIntervals();
    Utils.validateArg(firstReadCounts.size() <= maximumChunkSize, String.format("The number of intervals (%d) in each read-counts file cannot exceed the maximum chunk size (%d).", firstReadCounts.size(), maximumChunkSize));
    // get GC content (null if not provided)
    final AnnotatedIntervalCollection annotatedIntervals = CopyNumberArgumentValidationUtils.validateAnnotatedIntervals(inputAnnotatedIntervalsFile, firstReadCounts, logger);
    final double[] intervalGCContent = annotatedIntervals == null ? null : annotatedIntervals.getRecords().stream().mapToDouble(i -> i.getAnnotationMap().getValue(CopyNumberAnnotations.GC_CONTENT)).toArray();
    // validate input read-counts files (i.e., check intervals and that only integer counts are contained)
    // and aggregate as a RealMatrix with dimensions numIntervals x numSamples
    final RealMatrix readCountMatrix = constructReadCountMatrix(logger, inputReadCountFiles, sequenceDictionary, intervals);
    // create the PoN
    logger.info("Creating the panel of normals...");
    HDF5SVDReadCountPanelOfNormals.create(outputPanelOfNormalsFile, getCommandLine(), sequenceDictionary, readCountMatrix, sampleFilenames, intervals, intervalGCContent, minimumIntervalMedianPercentile, maximumZerosInSamplePercentage, maximumZerosInIntervalPercentage, extremeSampleMedianPercentile, doImputeZeros, extremeOutlierTruncationPercentile, numEigensamplesRequested, maximumChunkSize, ctx);
    logger.info(String.format("%s complete.", getClass().getSimpleName()));
}

19 View Complete Implementation : ReadsSparkSource.java
Copyright BSD 3-Clause "New" or "Revised" License
Author : broadinstitute
/**
 * Check that for CRAM the reference is set to a file that exists and is not 2bit.
 * @return the <code>referencePath</code> or <code>null</code> if not CRAM
 */
static String checkCramReference(final JavaSparkContext ctx, final String filePath, final String referencePath) {
    if (IOUtils.isCramFileName(filePath)) {
        if (referencePath == null) {
            throw new UserException.MissingReference("A reference is required for CRAM input");
        } else if (ReferenceTwoBitSparkSource.isTwoBit(referencePath)) {
            // htsjdk can't handle 2bit reference files
            throw new UserException("A 2bit file cannot be used as a CRAM file reference");
        } else {
            final Path refPath = new Path(referencePath);
            if (!SparkUtils.pathExists(ctx, refPath)) {
                throw new UserException.MissingReference("The specified fasta file (" + referencePath + ") does not exist.");
            }
        }
        return referencePath;
    }
    return null;
}

19 View Complete Implementation : PathSeqScoreSpark.java
Copyright BSD 3-Clause "New" or "Revised" License
Author : broadinstitute
@Override
protected void runTool(final JavaSparkContext ctx) {
    if (!readArguments.getReadFiles().isEmpty()) {
        throw new UserException.BadInput("Please use --paired-input or --unpaired-input instead of --input");
    }
    final ReadsSparkSource readsSource = new ReadsSparkSource(ctx, readArguments.getReadValidationStringency());
    // Load reads
    final Tuple2<JavaRDD<GATKRead>, SAMFileHeader> pairedData = readInputWithHeader(pairedInput, readsSource);
    final Tuple2<JavaRDD<GATKRead>, SAMFileHeader> unpairedData = readInputWithHeader(unpairedInput, readsSource);
    final JavaRDD<GATKRead> pairedReads = pairedData._1;
    final SAMFileHeader pairedHeader = pairedData._2;
    final JavaRDD<GATKRead> unpairedReads = unpairedData._1;
    final SAMFileHeader unpairedHeader = unpairedData._2;
    if (pairedHeader != null && !pairedHeader.getSortOrder().equals(SAMFileHeader.SortOrder.queryname)) {
        throw new UserException.BadInput("Paired input BAM must be sorted by queryname");
    }
    // Join header sequences and read groups
    final SAMFileHeader header = joinBamHeaders(pairedHeader, unpairedHeader);
    // Main tool routine
    final PSScorer scorer = new PSScorer(scoreArgs);
    final JavaRDD<GATKRead> readsFinal = scorer.scoreReads(ctx, pairedReads, unpairedReads, header);
    if (scoreArgs.scoreMetricsFileUri != null) {
        try (final PSScoreLogger scoreLogger = new PSScoreFileLogger(getMetricsFile(), scoreArgs.scoreMetricsFileUri)) {
            scoreLogger.logReadCounts(readsFinal);
        }
    }
    // Write reads to BAM, if specified
    // Note writeReads() is not used because we determine recommendedNumReducers differently with 2 input BAMs
    if (outputPath != null) {
        try {
            ReadsSparkSink.writeReads(ctx, outputPath, null, readsFinal, header, shardedOutput ? ReadsWriteFormat.SHARDED : ReadsWriteFormat.SINGLE, recommendedNumReducers, shardedPartsDir, true);
        } catch (final IOException e) {
            throw new UserException.CouldNotCreateOutputFile(outputPath, "writing failed", e);
        }
    }
}

19 View Complete Implementation : GATKSparkTool.java
Copyright BSD 3-Clause "New" or "Revised" License
Author : broadinstitute
/**
 * Register the reference file (and associated dictionary and index) to be downloaded to every node using Spark's
 * copying mechanism ({@code SparkContext#addFile()}).
 * @param ctx the Spark context
 * @param referenceFile the reference file, can be a local file or a remote path
 * @return the reference file name; the absolute path of the file can be found by a Spark task using {@code SparkFiles#get()}
 */
protected static String addReferenceFilesForSpark(JavaSparkContext ctx, String referenceFile) {
    if (referenceFile == null) {
        return null;
    }
    Path referencePath = IOUtils.getPath(referenceFile);
    Path indexPath = ReferenceSequenceFileFactory.getFastaIndexFileName(referencePath);
    Path dictPath = ReferenceSequenceFileFactory.getDefaultDictionaryForReferenceSequence(referencePath);
    Path gziPath = GZIIndex.resolveIndexNameForBgzipFile(referencePath);
    ctx.addFile(referenceFile);
    if (Files.exists(indexPath)) {
        ctx.addFile(indexPath.toUri().toString());
    }
    if (Files.exists(dictPath)) {
        ctx.addFile(dictPath.toUri().toString());
    }
    if (Files.exists(gziPath)) {
        ctx.addFile(gziPath.toUri().toString());
    }
    return referencePath.getFileName().toString();
}
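
Files registered through ctx.addFile() as above are resolved on executors with SparkFiles.get(), as the HaplotypeCallerSpark example further down also does. Below is a minimal sketch of the consumer side, assuming referenceFileName is the value returned by addReferenceFilesForSpark(); the String RDD is just a placeholder payload.

import org.apache.spark.SparkFiles;
import org.apache.spark.api.java.JavaRDD;

public final class ReferenceOnExecutorExample {

    /** Tag each record with the executor-local path of the distributed reference file. */
    public static JavaRDD<String> tagWithLocalReferencePath(final JavaRDD<String> records, final String referenceFileName) {
        return records.map(record -> {
            // SparkFiles.get() returns the absolute path of the copy downloaded to this executor.
            final String localReferencePath = SparkFiles.get(referenceFileName);
            return record + "\t" + localReferencePath;
        });
    }
}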

19 View Complete Implementation : SchemaProvider.java
Copyright Apache License 2.0
Author : apache
/**
 * Class to provide schema for reading data and also writing into a Hoodie table.
 */
public abstract class SchemaProvider implements Serializable {

    protected TypedProperties config;

    protected JavaSparkContext jssc;

    protected SchemaProvider(TypedProperties props, JavaSparkContext jssc) {
        this.config = props;
        this.jssc = jssc;
    }

    public abstract Schema getSourceSchema();

    public Schema getTargetSchema() {
        // by default, use source schema as target for hoodie table as well
        return getSourceSchema();
    }
}

19 View Complete Implementation : HBaseIndex.java
Copyright Apache License 2.0
Author : apache
private void setPutBatchSize(JavaRDD<WriteStatus> writeStatusRDD, HBaseIndexQPSResourceAllocator hBaseIndexQPSResourceAllocator, final JavaSparkContext jsc) {
    if (config.getHbaseIndexPutBatchSizeAutoCompute()) {
        SparkConf conf = jsc.getConf();
        int maxExecutors = conf.getInt(DEFAULT_SPARK_EXECUTOR_INSTANCES_CONFIG_NAME, 1);
        if (conf.getBoolean(DEFAULT_SPARK_DYNAMIC_ALLOCATION_ENABLED_CONFIG_NAME, false)) {
            maxExecutors = Math.max(maxExecutors, conf.getInt(DEFAULT_SPARK_DYNAMIC_ALLOCATION_MAX_EXECUTORS_CONFIG_NAME, 1));
        }
        /*
       * Each writeStatus represents status information from a write done in one of the IOHandles. If a writeStatus has
       * any insert, it implies that the corresponding task contacts HBase for doing puts, since we only do puts for
       * inserts from HBaseIndex.
       */
        final Tuple2<Long, Integer> numPutsParallelismTuple = getHBasePutAccessParallelism(writeStatusRDD);
        final long numPuts = numPutsParallelismTuple._1;
        final int hbasePutsParallelism = numPutsParallelismTuple._2;
        this.numRegionServersForTable = getNumRegionServersAliveForTable();
        final float desiredQPSFraction = hBaseIndexQPSResourceAllocator.calculateQPSFractionForPutsTime(numPuts, this.numRegionServersForTable);
        LOG.info("Desired QPSFraction :" + desiredQPSFraction);
        LOG.info("Number HBase puts :" + numPuts);
        LOG.info("Hbase Puts Parallelism :" + hbasePutsParallelism);
        final float availableQpsFraction = hBaseIndexQPSResourceAllocator.acquireQPSResources(desiredQPSFraction, numPuts);
        LOG.info("Allocated QPS Fraction :" + availableQpsFraction);
        multiPutBatchSize = putBatchSizeCalculator.getBatchSize(numRegionServersForTable, maxQpsPerRegionServer, hbasePutsParallelism, maxExecutors, SLEEP_TIME_MILLISECONDS, availableQpsFraction);
        LOG.info("multiPutBatchSize :" + multiPutBatchSize);
    }
}

19 View Complete Implementation : SparkContextFactory.java
Copyright BSD 3-Clause "New" or "Revised" License
Author : broadinstitute
/**
 * Stop a {@link JavaSparkContext}, unless it is the test context.
 *
 * @param context the context to stop
 */
public static synchronized void stopSparkContext(final JavaSparkContext context) {
    // only call stop for a non-test context
    if (context != testContext) {
        context.stop();
    }
}
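
A sketch of the intended usage pattern, assuming the caller already holds a context (GATK tools typically obtain it from the same factory): do the Spark work in a try block and hand the context back to stopSparkContext() in finally, so a shared test context is never torn down by tool code.

import java.util.Arrays;
import org.apache.spark.api.java.JavaSparkContext;

public final class StopContextUsageExample {

    public static long countSomething(final JavaSparkContext ctx) {
        try {
            return ctx.parallelize(Arrays.asList("a", "b", "c")).count();
        } finally {
            // SparkContextFactory is the GATK class shown above; its import path depends on the GATK version.
            // This is a no-op for the factory's test context and a real stop() otherwise.
            SparkContextFactory.stopSparkContext(ctx);
        }
    }
}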

19 View Complete Implementation : PathSeqBwaSpark.java
Copyright BSD 3-Clause "New" or "Revised" License
Author : broadinstitute
@Override
protected void runTool(final JavaSparkContext ctx) {
    if (!readArguments.getReadFiles().isEmpty()) {
        throw new UserException.BadInput("Please use --paired-input or --unpaired-input instead of --input");
    }
    final ReadsSparkSource readsSource = new ReadsSparkSource(ctx, readArguments.getReadValidationStringency());
    final PSBwaAlignerSpark aligner = new PSBwaAlignerSpark(ctx, bwaArgs);
    boolean bPairedSuccess = alignBam(inputPaired, aligner, true, ctx, readsSource);
    boolean bUnpairedSuccess = alignBam(inputUnpaired, aligner, false, ctx, readsSource);
    if (!bPairedSuccess && !bUnpairedSuccess) {
        throw new UserException.BadInput("No reads were loaded. Ensure --paired-input and/or --unpaired-input are set and valid.");
    }
    aligner.close();
}

19 View Complete Implementation : HaplotypeCallerSpark.java
Copyright BSD 3-Clause "New" or "Revised" License
Author : broadinstitute
/**
 * Call Variants using HaplotypeCaller on Spark and write out a VCF file.
 *
 * This may be called from any spark pipeline in order to call variants from an RDD of GATKRead
 * @param ctx the spark context
 * @param reads the reads variants should be called from
 * @param header the header that goes with the reads
 * @param reference the full path to the reference file (must have been added via {@code SparkContext#addFile()})
 * @param intervalShards the interval shards to restrict calling to
 * @param hcArgs haplotype caller arguments
 * @param shardingArgs arguments to control how the assembly regions are sharded
 * @param output the output path for the VCF
 * @param logger the logger to use
 * @param strict whether to use the strict implementation (slower) for finding assembly regions to match the walker version
 * @param createOutputVariantIndex create a variant index (tabix for bgzipped VCF only)
 */
public static void callVariantsWithHaplotypeCallerAndWriteOutput(final JavaSparkContext ctx, final JavaRDD<GATKRead> reads, final SAMFileHeader header, final SAMSequenceDictionary sequenceDictionary, final String reference, final List<ShardBoundary> intervalShards, final HaplotypeCallerArgumentCollection hcArgs, final AssemblyRegionReadShardArgumentCollection shardingArgs, final AssemblyRegionArgumentCollection assemblyRegionArgs, final boolean includeReadsWithDeletionsInIsActivePileups, final String output, final Collection<Annotation> annotations, final Logger logger, final boolean strict, final boolean createOutputVariantIndex) {
    final Path referencePath = IOUtils.getPath(reference);
    final String referenceFileName = referencePath.getFileName().toString();
    Broadcast<Supplier<AssemblyRegionEvaluator>> assemblyRegionEvaluatorSupplierBroadcast = assemblyRegionEvaluatorSupplierBroadcast(ctx, hcArgs, header, reference, annotations);
    JavaRDD<AssemblyRegionWalkerContext> assemblyRegions = strict ? FindAssemblyRegionsSpark.getAssemblyRegionsStrict(ctx, reads, header, sequenceDictionary, referenceFileName, null, intervalShards, assemblyRegionEvaluatorSupplierBroadcast, shardingArgs, assemblyRegionArgs, includeReadsWithDeletionsInIsActivePileups, false) : FindAssemblyRegionsSpark.getAssemblyRegionsFast(ctx, reads, header, sequenceDictionary, referenceFileName, null, intervalShards, assemblyRegionEvaluatorSupplierBroadcast, shardingArgs, assemblyRegionArgs, includeReadsWithDeletionsInIsActivePileups, false);
    processAssemblyRegions(assemblyRegions, ctx, header, reference, hcArgs, output, annotations, logger, createOutputVariantIndex);
}

19 View Complete Implementation : SvDiscoverFromLocalAssemblyContigAlignmentsSpark.java
Copyright BSD 3-Clause "New" or "Revised" License
Author : broadinstitute
@Override
protected void runTool(final JavaSparkContext ctx) {
    validateParams();
    final Broadcast<SVIntervalTree<VariantContext>> cnvCallsBroadcast = StructuralVariationDiscoveryPipelineSpark.broadcastCNVCalls(ctx, getHeaderForReads(), discoverStageArgs.cnvCallsFile);
    final String outputPrefixWithSampleName = getOutputPrefix();
    final SvDiscoveryInputMetaData svDiscoveryInputMetaData = new SvDiscoveryInputMetaData(ctx, discoverStageArgs, nonCanonicalChromosomeNamesFile, outputPrefixWithSampleName, null, null, null, cnvCallsBroadcast, getHeaderForReads(), getReference(), getDefaultToolVCFHeaderLines(), localLogger);
    final JavaRDD<GATKRead> assemblyRawAlignments = getReads();
    final AssemblyContigsClassifiedByAlignmentSignatures contigsByPossibleRawTypes = preprocess(svDiscoveryInputMetaData, assemblyRawAlignments);
    final List<VariantContext> variants = dispatchJobs(ctx, contigsByPossibleRawTypes, svDiscoveryInputMetaData, assemblyRawAlignments, writeSAMFiles);
    contigsByPossibleRawTypes.unpersist();
    filterAndWriteMergedVCF(outputPrefixWithSampleName, variants, svDiscoveryInputMetaData);
}

19 View Complete Implementation : HaplotypeCallerSpark.java
Copyright BSD 3-Clause "New" or "Revised" License
Author : broadinstitute
@Override
protected void runTool(JavaSparkContext ctx) {
    // TODO remove me when https://github.com/broadinstitute/gatk/issues/4303 are fixed
    if (output.endsWith(FileExtensions.BCF) || output.endsWith(FileExtensions.BCF + ".gz")) {
        throw new UserException.UnimplementedFeature("It is currently not possible to write a BCF file on spark.  See https://github.com/broadinstitute/gatk/issues/4303 for more details .");
    }
    Utils.validateArg(hcArgs.dbsnp.dbsnp == null, "HaplotypeCallerSpark does not yet support -D or --dbsnp arguments");
    Utils.validateArg(hcArgs.comps.isEmpty(), "HaplotypeCallerSpark does not yet support -comp or --comp arguments");
    Utils.validateArg(hcArgs.bamOutputPath == null, "HaplotypeCallerSpark does not yet support -bamout or --bamOutput");
    Utils.validate(getHeaderForReads().getSortOrder() == SAMFileHeader.SortOrder.coordinate, "The reads must be coordinate sorted.");
    logger.info("********************************************************************************");
    logger.info("The output of this tool DOES NOT match the output of HaplotypeCaller. ");
    logger.info("It is under development and should not be used for production work. ");
    logger.info("For evaluation only.");
    logger.info("Use the non-spark HaplotypeCaller if you care about the results. ");
    logger.info("********************************************************************************");
    try {
        super.runTool(ctx);
    } catch (Exception e) {
        if (e.getCause() instanceof UserException) {
            throw (UserException) e.getCause();
        } else {
            throw e;
        }
    }
}

19 View Complete Implementation : SparkSharder.java
Copyright BSD 3-Clause "New" or "Revised" License
Author : broadinstitute
private static <L extends Locatable, I extends Locatable, T> JavaRDD<T> joinOverlapping(JavaSparkContext ctx, JavaRDD<L> locatables, Class<L> locatableClass, SAMSequenceDictionary sequenceDictionary, JavaRDD<I> intervals, int maxLocatableLength, MapFunction<Tuple2<I, Iterable<L>>, T> f) {
    return joinOverlapping(ctx, locatables, locatableClass, sequenceDictionary, intervals, maxLocatableLength, (FlatMapFunction2<Iterator<L>, Iterator<I>, T>) (locatablesIterator, shardsIterator) -> Iterators.transform(locatablesPerShard(locatablesIterator, shardsIterator, sequenceDictionary, maxLocatableLength), new Function<Tuple2<I, Iterable<L>>, T>() {

        @Nullable
        @Override
        public T apply(@Nullable Tuple2<I, Iterable<L>> input) {
            try {
                return f.call(input);
            } catch (Exception e) {
                throw new RuntimeException(e);
            }
        }
    }));
}

19 View Complete Implementation : HaplotypeCallerSpark.java
Copyright BSD 3-Clause "New" or "Revised" License
Author : broadinstitute
@Override
protected Broadcast<Supplier<AssemblyRegionEvaluator>> assemblyRegionEvaluatorSupplierBroadcast(final JavaSparkContext ctx) {
    final Path referencePath = IOUtils.getPath(referenceArguments.getReferenceFileName());
    final String referenceFileName = referencePath.getFileName().toString();
    final String pathOnExecutor = SparkFiles.get(referenceFileName);
    final ReferenceSequenceFile taskReferenceSequenceFile = new CachingIndexedFastaSequenceFile(IOUtils.getPath(pathOnExecutor));
    final Collection<Annotation> annotations = makeVariantAnnotations();
    final VariantAnnotatorEngine annotatorEngine = new VariantAnnotatorEngine(annotations, hcArgs.dbsnp.dbsnp, hcArgs.comps, hcArgs.emitReferenceConfidence != ReferenceConfidenceMode.NONE, false);
    return assemblyRegionEvaluatorSupplierBroadcastFunction(ctx, hcArgs, getHeaderForReads(), taskReferenceSequenceFile, annotatorEngine);
}

19 View Complete Implementation : HoodieBloomIndex.java
Copyright Apache License 2.0
Author : apache
@Override
public JavaRDD<HoodieRecord<T>> tagLocation(JavaRDD<HoodieRecord<T>> recordRDD, JavaSparkContext jsc, HoodieTable<T> hoodieTable) {
    // Step 0: cache the input record RDD
    if (config.getBloomIndexUseCaching()) {
        recordRDD.persist(config.getBloomIndexInputStorageLevel());
    }
    // Step 1: Extract out thinner JavaPairRDD of (partitionPath, recordKey)
    JavaPairRDD<String, String> partitionRecordKeyPairRDD = recordRDD.mapToPair(record -> new Tuple2<>(record.getPartitionPath(), record.getRecordKey()));
    // Lookup indexes for all the partition/recordkey pairs
    JavaPairRDD<HoodieKey, HoodieRecordLocation> keyFilenamePairRDD = lookupIndex(partitionRecordKeyPairRDD, jsc, hoodieTable);
    // Cache the result, for subsequent stages.
    if (config.getBloomIndexUseCaching()) {
        keyFilenamePairRDD.persist(StorageLevel.MEMORY_AND_DISK_SER());
    }
    if (LOG.isDebugEnabled()) {
        long totalTaggedRecords = keyFilenamePairRDD.count();
        LOG.debug("Number of update records (ones tagged with a fileID): " + totalTaggedRecords);
    }
    // Step 4: Tag the incoming records, as inserts or updates, by joining with existing record keys
    // Cost: 4 sec.
    JavaRDD<HoodieRecord<T>> taggedRecordRDD = tagLocationBacktoRecords(keyFilenamePairRDD, recordRDD);
    if (config.getBloomIndexUseCaching()) {
        // unpersist the input Record RDD
        recordRDD.unpersist();
        keyFilenamePairRDD.unpersist();
    }
    return taggedRecordRDD;
}

19 View Complete Implementation : UtilHelpers.java
Copyright Apache License 2.0
Author : apache
/**
 * Build Hoodie write client.
 *
 * @param jsc Java Spark Context
 * @param basePath Base Path
 * @param schemaStr Schema
 * @param parallelism Parallelism
 */
public static HoodieWriteClient createHoodieClient(JavaSparkContext jsc, String basePath, String schemaStr, int parallelism, Option<String> compactionStrategyClass, TypedProperties properties) throws Exception {
    HoodieCompactionConfig compactionConfig = compactionStrategyClass.map(strategy -> HoodieCompactionConfig.newBuilder().withInlineCompaction(false).withCompactionStrategy(ReflectionUtils.loadClass(strategy)).build()).orElse(HoodieCompactionConfig.newBuilder().withInlineCompaction(false).build());
    HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath).withParallelism(parallelism, parallelism).withBulkInsertParallelism(parallelism).withSchema(schemaStr).combineInput(true, true).withCompactionConfig(compactionConfig).withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build()).withProps(properties).build();
    return new HoodieWriteClient(jsc, config);
}

19 View Complete Implementation : TestHDFSParquetImporter.java
Copyright Apache License 2.0
Author : apache
/**
 * Tests for schema file. 1. File is missing. 2. File has invalid data.
 */
@Test
public void testSchemaFile() throws Exception {
    JavaSparkContext jsc = null;
    try {
        jsc = getJavaSparkContext();
        // Test root folder.
        String basePath = (new Path(dfsBasePath, Thread.currentThread().getStackTrace()[1].getMethodName())).toString();
        // Hoodie root folder
        Path hoodieFolder = new Path(basePath, "testTarget");
        Path srcFolder = new Path(basePath.toString(), "srcTest");
        Path schemaFile = new Path(basePath.toString(), "missingFile.schema");
        HDFSParquetImporter.Config cfg = getHDFSParquetImporterConfig(srcFolder.toString(), hoodieFolder.toString(), "testTable", "COPY_ON_WRITE", "_row_key", "timestamp", 1, schemaFile.toString());
        HDFSParquetImporter dataImporter = new HDFSParquetImporter(cfg);
        // Should fail - return : -1.
        assertEquals(-1, dataImporter.dataImport(jsc, 0));
        dfs.create(schemaFile).write("Random invalid schema data".getBytes());
        // Should fail - return : -1.
        assertEquals(-1, dataImporter.dataImport(jsc, 0));
    } finally {
        if (jsc != null) {
            jsc.stop();
        }
    }
}

19 View Complete Implementation : SparkSharder.java
Copyright BSD 3-Clause "New" or "Revised" License
Author : broadinstitute
/**
 * Join an RDD of locatables with a set of intervals, and apply a function to process the locatables that overlap each interval.
 * @param ctx the Spark Context
 * @param locatables the locatables RDD, must be coordinate sorted
 * @param locatableClass the class of the locatables, must be a subclass of {@link Locatable}
 * @param sequenceDictionary the sequence dictionary to use to find contig lengths
 * @param intervals the collection of intervals to apply the function to
 * @param maxLocatableLength the maximum length of a {@link Locatable}, if any is larger than this size then an exception will be thrown
 * @param f the function to process intervals and overlapping locatables with
 * @param <L> the {@link Locatable} type
 * @param <I> the interval type
 * @param <T> the return type of <code>f</code>
 * @return an RDD of the results of applying <code>f</code> to each interval and its overlapping locatables
 */
private static <L extends Locatable, I extends Locatable, T> JavaRDD<T> joinOverlapping(JavaSparkContext ctx, JavaRDD<L> locatables, Class<L> locatableClass, SAMSequenceDictionary sequenceDictionary, List<I> intervals, int maxLocatableLength, MapFunction<Tuple2<I, Iterable<L>>, T> f) {
    return joinOverlapping(ctx, locatables, locatableClass, sequenceDictionary, intervals, maxLocatableLength, (FlatMapFunction2<Iterator<L>, Iterator<I>, T>) (locatablesIterator, shardsIterator) -> Iterators.transform(locatablesPerShard(locatablesIterator, shardsIterator, sequenceDictionary, maxLocatableLength), new Function<Tuple2<I, Iterable<L>>, T>() {

        @Nullable
        @Override
        public T apply(@Nullable Tuple2<I, Iterable<L>> input) {
            try {
                return f.call(input);
            } catch (Exception e) {
                throw new RuntimeException(e);
            }
        }
    }));
}

19 View Complete Implementation : SVDFactory.java
Copyright BSD 3-Clause "New" or "Revised" License
Author : broadinstitute
/**
 * Create a SVD instance using a spark context.
 *
 * @param m matrix that is not {@code null}
 * @param ctx JavaSparkContext.  {@code null} is allowed, but will fall back to Apache Commons Math implementation.
 * @return SVD instance that is never {@code null}
 */
public static SVD createSVD(final RealMatrix m, final JavaSparkContext ctx) {
    Utils.nonNull(m, "Cannot create SVD from a null matrix.");
    if (ctx == null) {
        return new OjAlgoSingularValueDecomposer().createSVD(m);
    }
    return new SparkSingularValueDecomposer(ctx).createSVD(m);
}
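
As the javadoc notes, passing a null JavaSparkContext falls back to a local (non-Spark) decomposition. Below is a minimal hedged sketch of calling the factory both ways; the 2x2 matrix is arbitrary, and SVD/SVDFactory are the GATK classes shown above (their import path depends on the GATK version, so it is omitted here).

import org.apache.commons.math3.linear.Array2DRowRealMatrix;
import org.apache.commons.math3.linear.RealMatrix;
import org.apache.spark.api.java.JavaSparkContext;

public final class SvdFactoryUsageExample {

    public static void decompose(final JavaSparkContext ctxOrNull) {
        final RealMatrix m = new Array2DRowRealMatrix(new double[][] { { 3, 1 }, { 1, 3 } });
        // null context -> local decomposition; non-null -> Spark-backed decomposition.
        final SVD svd = SVDFactory.createSVD(m, ctxOrNull);
    }
}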