org.apache.orc.TypeDescription - java examples

Here are examples of the Java API org.apache.orc.TypeDescription, taken from open source projects. By voting up you can indicate which examples are most useful and appropriate.

143 Examples
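
Before diving into the individual projects, a minimal sketch of the core TypeDescription workflow may help: build or parse a schema, create a VectorizedRowBatch from it, and hand both to an ORC writer. This sketch is not taken from any of the projects below; the output path and field names are made up.

import java.nio.charset.StandardCharsets;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
import org.apache.orc.OrcFile;
import org.apache.orc.TypeDescription;
import org.apache.orc.Writer;

public class TypeDescriptionSketch {

    public static void main(String[] args) throws Exception {
        // The string form is equivalent to the createStruct()/addField() builder API.
        TypeDescription schema = TypeDescription.fromString("struct<id:bigint,name:string>");
        Writer writer = OrcFile.createWriter(new Path("/tmp/example.orc"),
                OrcFile.writerOptions(new Configuration()).setSchema(schema));
        // The batch's column vectors are laid out in the same order as the schema fields.
        VectorizedRowBatch batch = schema.createRowBatch();
        LongColumnVector id = (LongColumnVector) batch.cols[0];
        BytesColumnVector name = (BytesColumnVector) batch.cols[1];
        int row = batch.size++;
        id.vector[row] = 1L;
        name.setVal(row, "alice".getBytes(StandardCharsets.UTF_8));
        writer.addRowBatch(batch);
        writer.close();
    }
}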

19 View Complete Implementation : VectorColumnFiller.java
Copyright Apache License 2.0
Author : pinterest
public static JsonConverter createConverter(TypeDescription schema) {
    switch(schema.getCategory()) {
        case BYTE:
        case SHORT:
        case INT:
        case LONG:
            return new LongColumnConverter();
        case FLOAT:
        case DOUBLE:
            return new DoubleColumnConverter();
        case CHAR:
        case VARCHAR:
        case STRING:
            return new StringColumnConverter();
        case DECIMAL:
            return new DecimalColumnConverter();
        case TIMESTAMP:
            return new TimestampColumnConverter();
        case BINARY:
            return new BinaryColumnConverter();
        case BOOLEAN:
            return new BooleanColumnConverter();
        case STRUCT:
            return new StructColumnConverter(schema);
        case LIST:
            return new ListColumnConverter(schema);
        case MAP:
            return new MapColumnConverter(schema);
        case UNION:
            return new UnionColumnConverter(schema);
        default:
            throw new IllegalArgumentException("Unhandled type " + schema);
    }
}

19 View Complete Implementation : ORCRecordReader.java
Copyright Apache License 2.0
Author : apache
private void fillGenericRow(GenericRow genericRow, VectorizedRowBatch rowBatch) {
    // TODO: use Pinot schema to fill the values to handle missing column and default values properly
    // ORC's TypeDescription is the equivalent of a schema. The way we will support ORC in Pinot
    // will be to get the top level struct that contains all our fields and look through its
    // children to determine the fields in our schemas.
    if (_orcSchema.getCategory().equals(TypeDescription.Category.STRUCT)) {
        for (int i = 0; i < _orcSchema.getChildren().size(); i++) {
            // Get current column in schema
            TypeDescription currColumn = _orcSchema.getChildren().get(i);
            String currColumnName = _orcSchema.getFieldNames().get(i);
            if (!_pinotSchema.getColumnNames().contains(currColumnName)) {
                LOGGER.warn("Skipping column {} because it is not in pinot schema", currColumnName);
                continue;
            }
            // ORC will keep your columns in the same order as the schema provided
            ColumnVector vector = rowBatch.cols[i];
            // Previous value set to null, not used except to save allocation memory in OrcMapredRecordReader
            WritableComparable writableComparable;
            writableComparable = OrcMapredRecordReader.nextValue(vector, 0, currColumn, null);
            genericRow.putField(currColumnName, getBaseObject(writableComparable));
        }
    } else {
        throw new IllegalArgumentException("Not a valid schema");
    }
}

19 View Complete Implementation : OrcKeyComparator.java
Copyright Apache License 2.0
Author : apache
/**
 * Compare {@link OrcKey} in shuffle of MapReduce.
 * Delegate byte decoding to underlying {@link OrcStruct#readFields(DataInput)} method to simplify comparison.
 */
public class OrcKeyComparator extends Configured implements RawComparator<OrcKey> {

    private TypeDescription schema;

    private OrcKey key1;

    private OrcKey key2;

    private DataInputBuffer buffer;

    @Override
    public void setConf(Configuration conf) {
        super.setConf(conf);
        if (null != conf) {
            // The MapReduce framework will be using this comparator to sort OrcKey objects
            // output from the map phase, so use the schema defined for the map output key
            // and the data model non-raw compare() implementation.
            schema = TypeDescription.fromString(conf.get(OrcConf.MAPRED_SHUFFLE_KEY_SCHEMA.getAttribute()));
            OrcStruct orcRecordModel1 = (OrcStruct) OrcStruct.createValue(schema);
            OrcStruct orcRecordModel2 = (OrcStruct) OrcStruct.createValue(schema);
            if (key1 == null) {
                key1 = new OrcKey();
            }
            if (key2 == null) {
                key2 = new OrcKey();
            }
            if (buffer == null) {
                buffer = new DataInputBuffer();
            }
            key1.key = orcRecordModel1;
            key2.key = orcRecordModel2;
        }
    }

    @Override
    public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
        try {
            // parse key1
            buffer.reset(b1, s1, l1);
            key1.readFields(buffer);
            // parse key2
            buffer.reset(b2, s2, l2);
            key2.readFields(buffer);
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
        // compare them
        return compare(key1, key2);
    }

    @Override
    public int compare(OrcKey o1, OrcKey o2) {
        if (!(o1.key instanceof OrcStruct) || !(o2.key instanceof OrcStruct)) {
            throw new IllegalStateException("OrcKey should have its key value be instance of OrcStruct");
        }
        return ((OrcStruct) o1.key).compareTo((OrcStruct) o2.key);
    }
}
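
The comparator relies on setConf() seeing the shuffle key schema, which a MapReduce job would normally place in its configuration. A hedged usage sketch, with a made-up key schema:

// Sketch (assumed setup): the map-output key schema must be in the Configuration before the comparator is used.
private static OrcKeyComparator newComparator() {
    Configuration conf = new Configuration();
    // hypothetical key schema; in a real job this matches the map output key type
    conf.set(OrcConf.MAPRED_SHUFFLE_KEY_SCHEMA.getAttribute(), "struct<id:bigint,name:string>");
    OrcKeyComparator comparator = new OrcKeyComparator();
    comparator.setConf(conf); // parses the schema and allocates the reusable OrcKey/OrcStruct buffers
    return comparator;
}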

19 View Complete Implementation : OrcConverter.java
Copyright Apache License 2.0
Author : pentaho
public RowMetaAndData convertFromOrc(VectorizedRowBatch batch, int currentBatchRow, List<? extends IOrcInputField> dialogInputFields, TypeDescription typeDescription, Map<String, Integer> schemaToOrcSubcripts, List<? extends IOrcInputField> orcInputFields) {
    return convertFromOrc(new RowMetaAndData(), batch, currentBatchRow, dialogInputFields, typeDescription, schemaToOrcSubcripts, orcInputFields);
}

19 View Complete Implementation : OsmChangesetXml2Orc.java
Copyright ISC License
Author : mojodna
public class OsmChangesetXml2Orc {

    private static final TypeDescription SCHEMA = createStruct()
        .addField(Changeset.ID, createLong())
        .addField("tags", createMap(createString(), createString()))
        .addField(Changeset.CREATED_AT, createTimestamp())
        .addField(Changeset.OPEN, createBoolean())
        .addField(Changeset.CLOSED_AT, createTimestamp())
        .addField(Changeset.COMMENTS_COUNT, createLong())
        .addField(Changeset.MIN_LAT, createDecimal().withScale(7).withPrecision(9))
        .addField(Changeset.MAX_LAT, createDecimal().withScale(7).withPrecision(9))
        .addField(Changeset.MIN_LON, createDecimal().withScale(7).withPrecision(10))
        .addField(Changeset.MAX_LON, createDecimal().withScale(7).withPrecision(10))
        .addField(Changeset.NUM_CHANGES, createLong())
        .addField(Changeset.UID, createLong())
        .addField(Changeset.USER, createString());

    private InputStream inputStream;

    private String outputOrc;

    public OsmChangesetXml2Orc(InputStream inputStream, String outputOrc) {
        this.inputStream = inputStream;
        this.outputOrc = outputOrc;
    }

    public void convert() throws Exception {
        // Setup ORC writer
        Configuration conf = new Configuration();
        conf.setBoolean(OrcConf.BLOCK_PADDING.getAttribute(), false);
        Writer writer = OrcFile.createWriter(new Path(outputOrc), OrcFile.writerOptions(conf).setSchema(SCHEMA));
        // Setup ORC vectors
        VectorizedRowBatch batch = SCHEMA.createRowBatch();
        LongColumnVector id = (LongColumnVector) batch.cols[0];
        MapColumnVector tags = (MapColumnVector) batch.cols[1];
        TimestampColumnVector createdAt = (TimestampColumnVector) batch.cols[2];
        LongColumnVector open = (LongColumnVector) batch.cols[3];
        TimestampColumnVector closedAt = (TimestampColumnVector) batch.cols[4];
        LongColumnVector commentsCount = (LongColumnVector) batch.cols[5];
        DecimalColumnVector minLat = (DecimalColumnVector) batch.cols[6];
        DecimalColumnVector maxLat = (DecimalColumnVector) batch.cols[7];
        DecimalColumnVector minLon = (DecimalColumnVector) batch.cols[8];
        DecimalColumnVector maxLon = (DecimalColumnVector) batch.cols[9];
        LongColumnVector numChanges = (LongColumnVector) batch.cols[10];
        LongColumnVector uid = (LongColumnVector) batch.cols[11];
        BytesColumnVector user = (BytesColumnVector) batch.cols[12];
        // Parse Changeset XML
        SAXParser parser = SAXParserFactory.newInstance().newSAXParser();
        parser.parse(new InputSource(new InputStreamReader(inputStream, "UTF-8")), new ChangesetXmlHandler(changeset -> {
            int row;
            if (batch.size == batch.getMaxSize()) {
                try {
                    writer.addRowBatch(batch);
                    batch.reset();
                } catch (IOException e) {
                    throw new RuntimeException(e);
                }
            }
            row = batch.size++;
            id.vector[row] = changeset.getId();
            try {
                createdAt.time[row] = changeset.getCreatedAt().getTimestamp().getTime();
            } catch (Exception e) {
                createdAt.time[row] = 0;
                createdAt.isNull[row] = true;
            }
            createdAt.nanos[row] = 0;
            try {
                closedAt.time[row] = changeset.getClosedAt().getTimestamp().getTime();
            } catch (Exception e) {
                closedAt.time[row] = 0;
                closedAt.isNull[row] = true;
            }
            closedAt.nanos[row] = 0;
            if (changeset.isOpen()) {
                open.vector[row] = 1;
            } else {
                open.vector[row] = 0;
            }
            numChanges.vector[row] = changeset.getNumChanges();
            if (changeset.getUser() != null) {
                user.setVal(row, changeset.getUser().getBytes());
            } else {
                user.setVal(row, new byte[0]);
                user.isNull[row] = true;
            }
            if (changeset.getUid() != null) {
                uid.vector[row] = changeset.getUid();
            } else {
                uid.isNull[row] = true;
            }
            // We've kept these parsed values as string
            // to guarantee no double precision loss.
            String minLatStr = changeset.getMinLat();
            String maxLatStr = changeset.getMaxLat();
            String minLonStr = changeset.getMinLon();
            String maxLonStr = changeset.getMaxLon();
            if (minLatStr != null) {
                minLat.set(row, HiveDecimal.create(new BigDecimal(minLatStr)));
            } else {
                minLat.set(row, (HiveDecimal) null);
                minLat.isNull[row] = true;
            }
            if (maxLatStr != null) {
                maxLat.set(row, HiveDecimal.create(new BigDecimal(maxLatStr)));
            } else {
                maxLat.set(row, (HiveDecimal) null);
                maxLat.isNull[row] = true;
            }
            if (minLonStr != null) {
                minLon.set(row, HiveDecimal.create(new BigDecimal(minLonStr)));
            } else {
                minLon.set(row, (HiveDecimal) null);
                minLon.isNull[row] = true;
            }
            if (maxLonStr != null) {
                maxLon.set(row, HiveDecimal.create(new BigDecimal(maxLonStr)));
            } else {
                maxLon.set(row, (HiveDecimal) null);
                maxLon.isNull[row] = true;
            }
            commentsCount.vector[row] = changeset.getCommentsCount();
            // tags
            tags.offsets[row] = tags.childCount;
            Map<String, String> _tags = changeset.getTags();
            tags.lengths[row] = _tags.size();
            tags.childCount += tags.lengths[row];
            tags.keys.ensureSize(tags.childCount, tags.offsets[row] != 0);
            tags.values.ensureSize(tags.childCount, tags.offsets[row] != 0);
            int i = 0;
            for (Map.Entry<String, String> kv : _tags.entrySet()) {
                ((BytesColumnVector) tags.keys).setVal((int) tags.offsets[row] + i, kv.getKey().getBytes());
                ((BytesColumnVector) tags.values).setVal((int) tags.offsets[row] + i, kv.getValue().getBytes());
                ++i;
            }
        }));
        // flush any pending rows
        writer.addRowBatch(batch);
        writer.close();
    }
}

19 View Complete Implementation : OrcStructConverter.java
Copyright Apache License 2.0
Author : apache
@Nullable
private static Object convertPrimitive(TypeDescription fieldDescription, @Nullable WritableComparable field, boolean binaryAsString) {
    if (field == null) {
        return null;
    }
    /*
        ORC TYPE    WRITABLE TYPE
        binary      org.apache.hadoop.io.BytesWritable
        bigint      org.apache.hadoop.io.LongWritable
        boolean     org.apache.hadoop.io.BooleanWritable
        char        org.apache.hadoop.io.Text
        date        org.apache.hadoop.hive.serde2.io.DateWritable
        decimal     org.apache.hadoop.hive.serde2.io.HiveDecimalWritable
        double      org.apache.hadoop.io.DoubleWritable
        float       org.apache.hadoop.io.FloatWritable
        int         org.apache.hadoop.io.IntWritable
        smallint    org.apache.hadoop.io.ShortWritable
        string      org.apache.hadoop.io.Text
        timestamp   org.apache.orc.mapred.OrcTimestamp
        tinyint     org.apache.hadoop.io.ByteWritable
        varchar     org.apache.hadoop.io.Text
     */
    switch(fieldDescription.getCategory()) {
        case STRING:
        case CHAR:
        case VARCHAR:
            return ((Text) field).toString();
        case BOOLEAN:
            return ((BooleanWritable) field).get();
        case BYTE:
            return ((ByteWritable) field).get();
        case SHORT:
            return ((ShortWritable) field).get();
        case INT:
            return ((IntWritable) field).get();
        case LONG:
            return ((LongWritable) field).get();
        case FLOAT:
            return ((FloatWritable) field).get();
        case DOUBLE:
            return ((DoubleWritable) field).get();
        case DECIMAL:
            return ((HiveDecimalWritable) field).getHiveDecimal().doubleValue();
        case TIMESTAMP:
            return ((OrcTimestamp) field).getTime();
        case DATE:
            return DateTimes.utc(((DateWritable) field).get().getTime());
        case BINARY:
            byte[] bytes = ((BytesWritable) field).getBytes();
            if (binaryAsString) {
                return StringUtils.fromUtf8(bytes);
            } else {
                return bytes;
            }
        default:
            return null;
    }
}
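
A hedged sketch of what this mapping means in practice for single primitive fields (convertPrimitive is private to the converter, so this is illustrative only):

// Sketch: a Text value described as an ORC string converts to a plain Java String.
TypeDescription stringType = TypeDescription.createString();
Object converted = convertPrimitive(stringType, new Text("hello"), false); // -> "hello"

// Sketch: a binary field is kept as byte[] or decoded to a String, depending on binaryAsString.
TypeDescription binaryType = TypeDescription.createBinary();
Object asBytes = convertPrimitive(binaryType, new BytesWritable("hi".getBytes()), false);  // -> byte[]
Object asString = convertPrimitive(binaryType, new BytesWritable("hi".getBytes()), true);  // -> "hi"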

19 View Complete Implementation : TypeConversion.java
Copyright Apache License 2.0
Author : Netflix
/**
 * Convert an ORC schema to an Iceberg schema.
 * @param schema the ORC schema
 * @param columnIds the column ids
 * @return the Iceberg schema
 */
public Schema fromOrc(TypeDescription schema, ColumnIdMap columnIds) {
    return new Schema(convertOrcToType(schema, columnIds).asStructType().fields());
}

19 View Complete Implementation : ORCSchemaUtil.java
Copyright Apache License 2.0
Author : apache
private static boolean isRequired(TypeDescription orcType) {
    String isRequiredStr = orcType.getAttributeValue(ICEBERG_REQUIRED_ATTRIBUTE);
    if (isRequiredStr != null) {
        return Boolean.parseBoolean(isRequiredStr);
    }
    return false;
}
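
Type attributes like this are read with TypeDescription.getAttributeValue (available since ORC 1.6). A hedged sketch of setting and reading such an attribute; the attribute key string below is an assumption, Iceberg's actual constant may differ:

// Sketch (hypothetical attribute key): attach a per-field attribute and read it back.
TypeDescription field = TypeDescription.createLong();
field.setAttribute("iceberg.required", "true");
String requiredStr = field.getAttributeValue("iceberg.required");
boolean required = requiredStr != null && Boolean.parseBoolean(requiredStr);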

19 View Complete Implementation : OrcWriter.java
Copyright ISC License
Author : mojodna
public class OrcWriter implements Sink {

    private static final Logger LOG = Logger.getLogger(OrcWriter.class.getName());

    private static final TypeDescription SCHEMA = createStruct()
        .addField("id", createLong())
        .addField("type", createString())
        .addField("tags", createMap(createString(), createString()))
        .addField("lat", createDecimal().withScale(7).withPrecision(9))
        .addField("lon", createDecimal().withScale(7).withPrecision(10))
        .addField("nds", createList(createStruct().addField("ref", createLong())))
        .addField("members", createList(createStruct()
            .addField("type", createString())
            .addField("ref", createLong())
            .addField("role", createString())))
        .addField("changeset", createLong())
        .addField("timestamp", createTimestamp())
        .addField("uid", createLong())
        .addField("user", createString())
        .addField("version", createLong())
        .addField("visible", createBoolean());

    private OrcEntityProcessor processor;

    private String filename;

    private class OrcEntityProcessor implements EntityProcessor {

        private final Writer writer;

        private final VectorizedRowBatch batch;

        private int row;

        OrcEntityProcessor(Writer writer, VectorizedRowBatch batch) {
            this.writer = writer;
            this.batch = batch;
        }

        private void checkLimit() {
            if (batch.size == batch.getMaxSize()) {
                try {
                    writer.addRowBatch(batch);
                    batch.reset();
                } catch (IOException e) {
                    throw new OsmosisRuntimeException(e);
                }
            }
            row = batch.size++;
        }

        private void addCommonProperties(EntityContainer container) {
            LongColumnVector id = (LongColumnVector) batch.cols[0];
            BytesColumnVector type = (BytesColumnVector) batch.cols[1];
            MapColumnVector tags = (MapColumnVector) batch.cols[2];
            ListColumnVector nds = (ListColumnVector) batch.cols[5];
            ListColumnVector members = (ListColumnVector) batch.cols[6];
            LongColumnVector changeset = (LongColumnVector) batch.cols[7];
            TimestampColumnVector timestamp = (TimestampColumnVector) batch.cols[8];
            LongColumnVector uid = (LongColumnVector) batch.cols[9];
            BytesColumnVector user = (BytesColumnVector) batch.cols[10];
            LongColumnVector version = (LongColumnVector) batch.cols[11];
            LongColumnVector visible = (LongColumnVector) batch.cols[12];
            Entity entity = container.getEntity();
            id.vector[row] = entity.getId();
            changeset.vector[row] = entity.getChangesetId();
            type.setVal(row, entity.getType().toString().toLowerCase().getBytes());
            tags.offsets[row] = tags.childCount;
            // number of key/value pairings
            tags.lengths[row] = entity.getTags().size();
            tags.childCount += tags.lengths[row];
            tags.keys.ensureSize(tags.childCount, tags.offsets[row] != 0);
            tags.values.ensureSize(tags.childCount, tags.offsets[row] != 0);
            int i = 0;
            for (Tag tag : entity.getTags()) {
                ((BytesColumnVector) tags.keys).setVal((int) tags.offsets[row] + i, tag.getKey().getBytes());
                ((BytesColumnVector) tags.values).setVal((int) tags.offsets[row] + i, tag.getValue().getBytes());
                i++;
            }
            timestamp.time[row] = entity.getTimestamp().getTime();
            timestamp.nanos[row] = 0;
            uid.vector[row] = entity.getUser().getId();
            user.setVal(row, entity.getUser().getName().getBytes());
            version.vector[row] = entity.getVersion();
            visible.vector[row] = 1;
            if (entity.getMetaTags().get("visible") == Boolean.FALSE) {
                visible.vector[row] = 0;
            }
            nds.offsets[row] = nds.childCount;
            nds.lengths[row] = 0;
            members.offsets[row] = members.childCount;
            members.lengths[row] = 0;
        }

        @Override
        public void process(BoundContainer bound) {
        // TODO set bounds in metadata
        }

        @Override
        public void process(NodeContainer container) {
            DecimalColumnVector lat = (DecimalColumnVector) batch.cols[3];
            DecimalColumnVector lon = (DecimalColumnVector) batch.cols[4];
            checkLimit();
            addCommonProperties(container);
            Node node = container.getEntity();
            lat.set(row, HiveDecimal.create(BigDecimal.valueOf(node.getLatitude())));
            lon.set(row, HiveDecimal.create(BigDecimal.valueOf(node.getLongitude())));
        }

        @Override
        public void process(WayContainer container) {
            DecimalColumnVector lat = (DecimalColumnVector) batch.cols[3];
            DecimalColumnVector lon = (DecimalColumnVector) batch.cols[4];
            ListColumnVector nds = (ListColumnVector) batch.cols[5];
            checkLimit();
            addCommonProperties(container);
            lat.isNull[row] = true;
            lon.isNull[row] = true;
            lat.set(row, (HiveDecimal) null);
            lon.set(row, (HiveDecimal) null);
            Way way = container.getEntity();
            nds.lengths[row] = way.getWayNodes().size();
            nds.childCount += nds.lengths[row];
            nds.child.ensureSize(nds.childCount, nds.offsets[row] != 0);
            for (int j = 0; j < way.getWayNodes().size(); j++) {
                StructColumnVector ndsStruct = (StructColumnVector) nds.child;
                ((LongColumnVector) ndsStruct.fields[0]).vector[(int) nds.offsets[row] + j] = way.getWayNodes().get(j).getNodeId();
            }
        }

        @Override
        public void process(RelationContainer container) {
            DecimalColumnVector lat = (DecimalColumnVector) batch.cols[3];
            DecimalColumnVector lon = (DecimalColumnVector) batch.cols[4];
            ListColumnVector members = (ListColumnVector) batch.cols[6];
            checkLimit();
            addCommonProperties(container);
            lat.isNull[row] = true;
            lon.isNull[row] = true;
            lat.set(row, (HiveDecimal) null);
            lon.set(row, (HiveDecimal) null);
            Relation relation = container.getEntity();
            members.lengths[row] = relation.getMembers().size();
            members.childCount += members.lengths[row];
            members.child.ensureSize(members.childCount, members.offsets[row] != 0);
            for (int j = 0; j < relation.getMembers().size(); j++) {
                StructColumnVector membersStruct = (StructColumnVector) members.child;
                ((BytesColumnVector) membersStruct.fields[0]).setVal((int) members.offsets[row] + j, relation.getMembers().get(j).getMemberType().toString().toLowerCase().getBytes());
                ((LongColumnVector) membersStruct.fields[1]).vector[(int) members.offsets[row] + j] = relation.getMembers().get(j).getMemberId();
                ((BytesColumnVector) membersStruct.fields[2]).setVal((int) members.offsets[row] + j, relation.getMembers().get(j).getMemberRole().getBytes());
            }
        }

        void flush() throws IOException {
            writer.addRowBatch(batch);
        }

        void close() throws IOException {
            flush();
            writer.close();
        }
    }

    OrcWriter(String filename) {
        this.filename = filename;
    }

    @Override
    public void process(EntityContainer entityContainer) {
        entityContainer.process(processor);
    }

    @Override
    public void initialize(Map<String, Object> metaData) {
        try {
            Configuration conf = new Configuration();
            // conf.set(OrcConf.BLOOM_FILTER_COLUMNS.getAttribute(), "tags");
            processor = new OrcEntityProcessor(OrcFile.createWriter(new Path(filename), OrcFile.writerOptions(conf).setSchema(SCHEMA)), SCHEMA.createRowBatch());
        } catch (IOException e) {
            throw new OsmosisRuntimeException(e);
        }
    }

    @Override
    public void complete() {
        try {
            // flush any pending rows
            processor.close();
            processor = null;
        } catch (IOException e) {
            throw new OsmosisRuntimeException("Unable to complete the ORC file.", e);
        }
    }

    @Override
    public void close() {
    }
}

19 View Complete Implementation : GenericOrcWriter.java
Copyright Apache License 2.0
Author : apache
public static OrcValueWriter<Record> buildWriter(TypeDescription fileSchema) {
    return new GenericOrcWriter(fileSchema);
}

19 View Complete Implementation : DefaultORCSchemaProvider.java
Copyright Apache License 2.0
Author : pinterest
@Override
public TypeDescription getSchema(String topic, LogFilePath logFilePath) {
    TypeDescription topicSpecificTD = topicToSchemaMap.get(topic);
    if (null != topicSpecificTD) {
        return topicSpecificTD;
    }
    return schemaForAlltopic;
}

19 View Complete Implementation : ColumnIdMap.java
Copyright Apache License 2.0
Author : Netflix
@Override
public Integer put(TypeDescription key, Integer value) {
    return idMap.put(key, value);
}

19 View Complete Implementation : OrcStructConverter.java
Copyright Apache License 2.0
Author : apache
@Nonnull
private static List<Object> convertList(TypeDescription fieldDescription, OrcList orcList, boolean binaryAsString) {
    // if primitive list, convert primitives
    TypeDescription listType = fieldDescription.getChildren().get(0);
    if (listType.getCategory().isPrimitive()) {
        return (List<Object>) orcList.stream().map(li -> convertPrimitive(listType, (WritableComparable) li, binaryAsString)).collect(Collectors.toList());
    }
    return new ArrayList<Object>(orcList);
}

19 View Complete Implementation : GenericOrcWriter.java
Copyright Apache License 2.0
Author : apache
private static Converter buildConverter(TypeDescription schema) {
    switch(schema.getCategory()) {
        case BOOLEAN:
            return new BooleanConverter();
        case BYTE:
            return new ByteConverter();
        case SHORT:
            return new ShortConverter();
        case DATE:
        case INT:
            return new IntConverter();
        case LONG:
            return new LongConverter();
        case FLOAT:
            return new FloatConverter();
        case DOUBLE:
            return new DoubleConverter();
        case BINARY:
            return new BytesConverter();
        case STRING:
        case CHAR:
        case VARCHAR:
            return new StringConverter();
        case DECIMAL:
            return schema.getPrecision() <= 18 ? new Decimal18Converter(schema) : new Decimal38Converter(schema);
        case TIMESTAMP:
            return new TimestampConverter();
        case STRUCT:
            return new StructConverter(schema);
        case LIST:
            return new ListConverter(schema);
        case MAP:
            return new MapConverter(schema);
    }
    throw new IllegalArgumentException("Unhandled type " + schema);
}

19 View Complete Implementation : GenericOrcReader.java
Copyright Apache License 2.0
Author : apache
public static OrcValueReader<Record> buildReader(Schema expectedSchema, TypeDescription fileSchema) {
    return new GenericOrcReader(expectedSchema, fileSchema);
}

19 View Complete Implementation : GenericOrcReader.java
Copyright Apache License 2.0
Author : apache
private static Converter buildConverter(final Types.NestedField icebergField, final TypeDescription schema) {
    switch(schema.getCategory()) {
        case BOOLEAN:
            return new BooleanConverter();
        case BYTE:
            return new ByteConverter();
        case SHORT:
            return new ShortConverter();
        case DATE:
        case INT:
            return new IntConverter();
        case LONG:
            return new LongConverter();
        case FLOAT:
            return new FloatConverter();
        case DOUBLE:
            return new DoubleConverter();
        case TIMESTAMP:
            return new TimestampConverter();
        case DECIMAL:
            return new DecimalConverter();
        case BINARY:
            return new BinaryConverter();
        case STRING:
        case CHAR:
        case VARCHAR:
            return new StringConverter();
        case STRUCT:
            return new StructConverter(icebergField, schema);
        case LIST:
            return new ListConverter(icebergField, schema);
        case MAP:
            return new MapConverter(icebergField, schema);
        default:
            throw new IllegalArgumentException("Unhandled type " + schema);
    }
}

19 View Complete Implementation : OrcShimV200.java
Copyright Apache License 2.0
Author : apache
/**
 * Computes the ORC projection mask of the fields to include from the selected fields.
 *
 * @return The ORC projection mask.
 */
private static boolean[] computeProjectionMask(TypeDescription schema, int[] selectedFields) {
    // mask with all fields of the schema
    boolean[] projectionMask = new boolean[schema.getMaximumId() + 1];
    // for each selected field
    for (int inIdx : selectedFields) {
        // set all nested fields of a selected field to true
        TypeDescription fieldSchema = schema.getChildren().get(inIdx);
        for (int i = fieldSchema.getId(); i <= fieldSchema.getMaximumId(); i++) {
            projectionMask[i] = true;
        }
    }
    return projectionMask;
}
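
To make the indexing concrete, a hedged illustration with a small hypothetical schema (the helper above is private to the shim, so this only shows how column ids map to mask entries):

// Sketch: for struct<a:int,b:struct<c:string,d:double>> the ORC column ids are
//   0 = root struct, 1 = a, 2 = b, 3 = b.c, 4 = b.d
TypeDescription schema = TypeDescription.fromString("struct<a:int,b:struct<c:string,d:double>>");
// Selecting top-level field 1 ("b") marks ids 2, 3 and 4, so the mask becomes
// [false, false, true, true, true]  (length = schema.getMaximumId() + 1 = 5)
boolean[] mask = computeProjectionMask(schema, new int[] { 1 });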

19 View Complete Implementation : OrcRowSplitReader.java
Copyright Apache License 2.0
Author : apache
/**
 * {@link OrcSplitReader} to read ORC files into {@link Row}.
 */
public class OrcRowSplitReader extends OrcSplitReader<Row> {

    private final TypeDescription schema;

    private final int[] selectedFields;

    // the vector of rows that is read in a batch
    private final Row[] rows;

    public OrcRowSplitReader(Configuration conf, TypeDescription schema, int[] selectedFields, List<Predicate> conjunctPredicates, int batchSize, Path path, long splitStart, long splitLength) throws IOException {
        super(OrcShim.defaultShim(), conf, schema, selectedFields, conjunctPredicates, batchSize, path, splitStart, splitLength);
        this.schema = schema;
        this.selectedFields = selectedFields;
        // create and initialize the row batch
        this.rows = new Row[batchSize];
        for (int i = 0; i < batchSize; i++) {
            rows[i] = new Row(selectedFields.length);
        }
    }

    @Override
    protected int fillRows() {
        return OrcBatchReader.fillRows(rows, schema, rowBatch, selectedFields);
    }

    @Override
    public Row nextRecord(Row reuse) {
        // return the next row
        return rows[this.nextRow++];
    }
}

19 View Complete Implementation : OrcFileAppender.java
Copyright Apache License 2.0
Author : Netflix
/**
 * Create a file appender for ORC.
 */
public class OrcFileAppender implements FileAppender<VectorizedRowBatch> {

    private final Writer writer;

    private final TypeDescription orcSchema;

    private final ColumnIdMap columnIds = new ColumnIdMap();

    private final Path path;

    public static final String COLUMN_NUMBERS_ATTRIBUTE = "iceberg.column.ids";

    OrcFileAppender(Schema schema, OutputFile file, OrcFile.WriterOptions options, Map<String, byte[]> metadata) {
        orcSchema = TypeConversion.toOrc(schema, columnIds);
        options.setSchema(orcSchema);
        path = new Path(file.location());
        try {
            writer = OrcFile.createWriter(path, options);
        } catch (IOException e) {
            throw new RuntimeException("Can't create file " + path, e);
        }
        writer.addUserMetadata(COLUMN_NUMBERS_ATTRIBUTE, columnIds.serialize());
        metadata.forEach((key, value) -> writer.addUserMetadata(key, ByteBuffer.wrap(value)));
    }

    @Override
    public void add(VectorizedRowBatch datum) {
        try {
            writer.addRowBatch(datum);
        } catch (IOException e) {
            throw new RuntimeException("Problem writing to ORC file " + path, e);
        }
    }

    @Override
    public Metrics metrics() {
        try {
            long rows = writer.getNumberOfRows();
            ColumnStatistics[] stats = writer.getStatistics();
            // we don't currently have columnSizes or distinct counts.
            Map<Integer, Long> valueCounts = new HashMap<>();
            Map<Integer, Long> nullCounts = new HashMap<>();
            Integer[] icebergIds = new Integer[orcSchema.getMaximumId() + 1];
            for (TypeDescription type : columnIds.keySet()) {
                icebergIds[type.getId()] = columnIds.get(type);
            }
            for (int c = 1; c < stats.length; ++c) {
                if (icebergIds[c] != null) {
                    valueCounts.put(icebergIds[c], stats[c].getNumberOfValues());
                }
            }
            for (TypeDescription child : orcSchema.getChildren()) {
                int c = child.getId();
                if (icebergIds[c] != null) {
                    nullCounts.put(icebergIds[c], rows - stats[c].getNumberOfValues());
                }
            }
            return new Metrics(rows, null, valueCounts, nullCounts);
        } catch (IOException e) {
            throw new RuntimeException("Can't get statistics " + path, e);
        }
    }

    @Override
    public void close() throws IOException {
        writer.close();
    }

    public TypeDescription getSchema() {
        return orcSchema;
    }
}

18 View Complete Implementation : SparkOrcReader.java
Copyright Apache License 2.0
Author : apache
static Converter buildConverter(final TypeDescription schema) {
    switch(schema.getCategory()) {
        case BOOLEAN:
            return new BooleanConverter();
        case BYTE:
            return new ByteConverter();
        case SHORT:
            return new ShortConverter();
        case DATE:
        case INT:
            return new IntConverter();
        case LONG:
            return new LongConverter();
        case FLOAT:
            return new FloatConverter();
        case DOUBLE:
            return new DoubleConverter();
        case TIMESTAMP:
            return new TimestampConverter();
        case DECIMAL:
            if (schema.getPrecision() <= Decimal.MAX_LONG_DIGITS()) {
                return new Decimal18Converter(schema.getPrecision(), schema.getScale());
            } else {
                return new Decimal38Converter(schema.getPrecision(), schema.getScale());
            }
        case BINARY:
        case STRING:
        case CHAR:
        case VARCHAR:
            return new BinaryConverter();
        case STRUCT:
            return new StructConverter(schema);
        case LIST:
            return new ListConverter(schema);
        case MAP:
            return new MapConverter(schema);
        default:
            throw new IllegalArgumentException("Unhandled type " + schema);
    }
}

18 View Complete Implementation : DremioORCRecordUtils.java
Copyright Apache License 2.0
Author : dremio
/**
 * Plans the list of disk ranges that the given stripe needs to read the
 * indexes. All of the positions are relative to the start of the stripe.
 * @param  fileSchema the schema for the file
 * @param footer the stripe footer
 * @param ignoreNonUtf8BloomFilter should the reader ignore non-utf8
 *                                 encoded bloom filters
 * @param fileIncluded the columns (indexed by file columns) that should be
 *                     read
 * @param sargColumns true for the columns (indexed by file columns) that
 *                    we need bloom filters for
 * @param version the version of the software that wrote the file
 * @param bloomFilterKinds (output) the stream kind of the bloom filters
 * @return a list of merged disk ranges to read
 */
public static DiskRangeList planIndexReading(TypeDescription fileSchema, OrcProto.StripeFooter footer, boolean ignoreNonUtf8BloomFilter, boolean[] fileIncluded, boolean[] sargColumns, OrcFile.WriterVersion version, OrcProto.Stream.Kind[] bloomFilterKinds) {
    DiskRangeList.CreateHelper result = new DiskRangeList.CreateHelper();
    List<OrcProto.Stream> streams = footer.getStreamsList();
    // figure out which kind of bloom filter we want for each column
    // picks bloom_filter_utf8 if its available, otherwise bloom_filter
    if (sargColumns != null) {
        for (OrcProto.Stream stream : streams) {
            if (stream.hasKind() && stream.hasColumn()) {
                int column = stream.getColumn();
                if (sargColumns[column]) {
                    switch(stream.getKind()) {
                        case BLOOM_FILTER:
                            if (bloomFilterKinds[column] == null && !(ignoreNonUtf8BloomFilter && hadBadBloomFilters(fileSchema.findSubtype(column).getCategory(), version))) {
                                bloomFilterKinds[column] = OrcProto.Stream.Kind.BLOOM_FILTER;
                            }
                            break;
                        case BLOOM_FILTER_UTF8:
                            bloomFilterKinds[column] = OrcProto.Stream.Kind.BLOOM_FILTER_UTF8;
                            break;
                        default:
                            break;
                    }
                }
            }
        }
    }
    long offset = 0;
    for (OrcProto.Stream stream : footer.getStreamsList()) {
        if (stream.hasKind() && stream.hasColumn()) {
            int column = stream.getColumn();
            if (fileIncluded == null || fileIncluded[column]) {
                boolean needStream = false;
                switch(stream.getKind()) {
                    case ROW_INDEX:
                        needStream = true;
                        break;
                    case BLOOM_FILTER:
                        needStream = bloomFilterKinds[column] == OrcProto.Stream.Kind.BLOOM_FILTER;
                        break;
                    case BLOOM_FILTER_UTF8:
                        needStream = bloomFilterKinds[column] == OrcProto.Stream.Kind.BLOOM_FILTER_UTF8;
                        break;
                    default:
                        // Pass
                        break;
                }
                if (needStream) {
                    result.addOrMerge(offset, offset + stream.getLength(), true, false);
                }
            }
        }
        offset += stream.getLength();
    }
    return result.get();
}

18 View Complete Implementation : OrcRowInputFormat.java
Copyright Apache License 2.0
Author : ljygz
/**
 * Computes the ORC projection mask of the fields to include from the selected fields.
 *
 * @return The ORC projection mask.
 */
private boolean[] computeProjectionMask() {
    // mask with all fields of the schema
    boolean[] projectionMask = new boolean[schema.getMaximumId() + 1];
    // for each selected field
    for (int inIdx : selectedFields) {
        // set all nested fields of a selected field to true
        TypeDescription fieldSchema = schema.getChildren().get(inIdx);
        for (int i = fieldSchema.getId(); i <= fieldSchema.getMaximumId(); i++) {
            projectionMask[i] = true;
        }
    }
    return projectionMask;
}

18 View Complete Implementation : ColumnIdMap.java
Copyright Apache License 2.0
Author : Netflix
public static ColumnIdMap deserialize(TypeDescription schema, ByteBuffer serial) {
    ColumnIdMap result = new ColumnIdMap();
    String[] parts = StandardCharsets.UTF_8.decode(serial).toString().split(",");
    for (int i = 0; i < parts.length; ++i) {
        String[] subparts = parts[i].split(":");
        result.put(schema.findSubtype(Integer.parseInt(subparts[0])), Integer.parseInt(subparts[1]));
    }
    return result;
}

18 View Complete Implementation : SparkOrcReader.java
Copyright Apache License 2.0
Author : Netflix
static int getArrayElementSize(TypeDescription type) {
    switch(type.getCategory()) {
        case BOOLEAN:
        case BYTE:
            return 1;
        case SHORT:
            return 2;
        case INT:
        case FLOAT:
            return 4;
        default:
            return 8;
    }
}

18 View Complete Implementation : OrcSchemaConverter.java
Copyright Apache License 2.0
Author : pentaho
public TypeDescription buildTypeDescription(List<? extends IOrcOutputField> fields) {
    TypeDescription typeDescription = TypeDescription.createStruct();
    fields.forEach(field -> addStructField(typeDescription, field));
    return typeDescription;
}

18 View Complete Implementation : PentahoOrcOutputFormat.java
Copyright Apache License 2.0
Author : pentaho
@Override
public IPentahoRecordWriter createRecordWriter() {
    logger.logDetailed("Initializing Orc Writer");
    if (fields == null) {
        throw new IllegalStateException("Invalid state.  The fields to write are null");
    }
    if (outputFilename == null) {
        throw new IllegalStateException("Invalid state.  The outputFileName is null");
    }
    OrcSchemaConverter converter = new OrcSchemaConverter();
    TypeDescription schema = converter.buildTypeDescription(fields);
    return new PentahoOrcRecordWriter(fields, schema, outputFilename, conf);
}

18 View Complete Implementation : SparkOrcReader.java
Copyright Apache License 2.0
Author : apache
private static int getArrayElementSize(TypeDescription type) {
    switch(type.getCategory()) {
        case BOOLEAN:
        case BYTE:
            return 1;
        case SHORT:
            return 2;
        case INT:
        case FLOAT:
            return 4;
        default:
            return 8;
    }
}

18 View Complete Implementation : OrcStructConverter.java
Copyright Apache License 2.0
Author : apache
/**
 * Convert a orc struct field as though it were a map, by fieldIndex. Complex types will be transformed
 * into java lists and maps when possible ({@link OrcStructConverter#convertList} and
 * {@link OrcStructConverter#convertMap}), and
 * primitive types will be extracted into an ingestion friendly state (e.g. 'int' and 'long'). Finally,
 * if a field is not present, this method will return null.
 *
 * Note: "Union" types are not currently supported and will be returned as null
 */
@Nullable
Object convertField(OrcStruct struct, int fieldIndex) {
    if (fieldIndex < 0) {
        return null;
    }
    TypeDescription schema = struct.getSchema();
    TypeDescription fieldDescription = schema.getChildren().get(fieldIndex);
    WritableComparable fieldValue = struct.getFieldValue(fieldIndex);
    if (fieldValue == null) {
        return null;
    }
    if (fieldDescription.getCategory().isPrimitive()) {
        return convertPrimitive(fieldDescription, fieldValue, binaryAsString);
    } else {
        // handle complex column types
        /*
          ORC TYPE    WRITABLE TYPE
          array       org.apache.orc.mapred.OrcList
          map         org.apache.orc.mapred.OrcMap
          struct      org.apache.orc.mapred.OrcStruct
          uniontype   org.apache.orc.mapred.OrcUnion
       */
        switch(fieldDescription.getCategory()) {
            case LIST:
                OrcList orcList = (OrcList) fieldValue;
                return convertList(fieldDescription, orcList, binaryreplacedtring);
            case MAP:
                OrcMap map = (OrcMap) fieldValue;
                return convertMap(fieldDescription, map, binaryreplacedtring);
            case STRUCT:
                OrcStruct structMap = (OrcStruct) fieldValue;
                return convertStructToMap(structMap);
            case UNION:
            // sorry union types :(
            default:
                return null;
        }
    }
}

18 View Complete Implementation : DefaultORCSchemaProvider.java
Copyright Apache License 2.0
Author : pinterest
/**
 * Default implementation for ORC schema provider. It fetches ORC schemas from
 * configuration. User has to specify one schema per kafka topic or can have
 * same schema for all the topics.
 *
 * @author Ashish ([email protected])
 */
public class DefaultORCSchemaProvider implements ORCSchemaProvider {

    private Map<String, TypeDescription> topicToSchemaMap;

    private TypeDescription schemaForAlltopic;

    public DefaultORCSchemaProvider(SecorConfig config) {
        topicToSchemaMap = new HashMap<String, TypeDescription>();
        setSchemas(config);
    }

    @Override
    public TypeDescription getSchema(String topic, LogFilePath logFilePath) {
        TypeDescription topicSpecificTD = topicToSchemaMap.get(topic);
        if (null != topicSpecificTD) {
            return topicSpecificTD;
        }
        return schemaForAlltopic;
    }

    /**
     * This method is used for fetching all ORC schemas from config
     *
     * @param config
     */
    private void setSchemas(SecorConfig config) {
        Map<String, String> schemaPerTopic = config.getORCMessageSchema();
        for (Entry<String, String> entry : schemaPerTopic.entrySet()) {
            String topic = entry.getKey();
            TypeDescription schema = TypeDescription.fromString(entry.getValue());
            topicToSchemaMap.put(topic, schema);
            // If common schema is given
            if ("*".equals(topic)) {
                schemaForAlltopic = schema;
            }
        }
    }
}
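
The schema strings in the configuration use ORC's standard struct syntax. A hedged illustration of the map this provider ends up holding (topic names and schemas are made up; the actual Secor configuration keys are not shown here):

// Sketch (hypothetical topics/schemas): roughly what setSchemas() builds from the config.
Map<String, TypeDescription> topicToSchemaMap = new HashMap<>();
topicToSchemaMap.put("orders", TypeDescription.fromString("struct<id:bigint,amount:decimal(10,2)>"));
// a "*" entry acts as the fallback schema for every other topic
TypeDescription schemaForAllTopics = TypeDescription.fromString("struct<payload:string>");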

18 View Complete Implementation : OrcStructConverterTest.java
Copyright Apache License 2.0
Author : apache
private static TypeDescription createRootSchema(String fieldName, TypeDescription fieldType) {
    return createRootSchema(Collections.singletonMap(fieldName, fieldType));
}

18 View Complete Implementation : OrcStructConverterTest.java
Copyright Apache License 2.0
Author : apache
private static TypeDescription createRootSchema(Map<String, TypeDescription> fieldTypes) {
    final TypeDescription schema = TypeDescription.createStruct();
    fieldTypes.forEach(schema::addField);
    return schema;
}

18 View Complete Implementation : ColumnIdMap.java
Copyright Apache License 2.0
Author : Netflix
public ByteBuffer serialize() {
    StringBuilder buffer = new StringBuilder();
    boolean needComma = false;
    for (TypeDescription key : idMap.keySet()) {
        if (needComma) {
            buffer.append(',');
        } else {
            needComma = true;
        }
        buffer.append(key.getId());
        buffer.append(':');
        buffer.append(idMap.get(key).intValue());
    }
    return ByteBuffer.wrap(buffer.toString().getBytes(StandardCharsets.UTF_8));
}
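
The resulting format is a simple comma-separated list of ORC column id to Iceberg field id pairs. A hedged round-trip sketch with a tiny made-up schema:

// Sketch: serialize()/deserialize() round trip for struct<a:int,b:string>.
TypeDescription schema = TypeDescription.fromString("struct<a:int,b:string>");
ColumnIdMap ids = new ColumnIdMap();
ids.put(schema.findSubtype(1), 100);  // column "a" -> Iceberg field id 100
ids.put(schema.findSubtype(2), 101);  // column "b" -> Iceberg field id 101
ByteBuffer serialized = ids.serialize();  // e.g. the UTF-8 bytes of "1:100,2:101"
ColumnIdMap restored = ColumnIdMap.deserialize(schema, serialized);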

18 View Complete Implementation : OrcInputFormat.java
Copyright Apache License 2.0
Author : apache
/**
 * InputFormat to read ORC files.
 */
public abstract class OrcInputFormat<T> extends FileInputFormat<T> {

    // the number of fields rows to read in a batch
    protected int batchSize;

    // the configuration to read with
    protected Configuration conf;

    // the schema of the ORC files to read
    protected TypeDescription schema;

    // the fields of the ORC schema that the returned Rows are composed of.
    protected int[] selectedFields;

    protected ArrayList<Predicate> conjunctPredicates = new ArrayList<>();

    protected transient OrcSplitReader<T> reader;

    /**
     * Creates an OrcInputFormat.
     *
     * @param path The path to read ORC files from.
     * @param orcSchema The schema of the ORC files as ORC TypeDescription.
     * @param orcConfig The configuration to read the ORC files with.
     * @param batchSize The number of Row objects to read in a batch.
     */
    public OrcInputFormat(Path path, TypeDescription orcSchema, Configuration orcConfig, int batchSize) {
        super(path);
        // configure OrcInputFormat
        this.schema = orcSchema;
        this.conf = orcConfig;
        this.batchSize = batchSize;
        // set default selection mask, i.e., all fields.
        this.selectedFields = new int[this.schema.getChildren().size()];
        for (int i = 0; i < selectedFields.length; i++) {
            this.selectedFields[i] = i;
        }
    }

    /**
     * Selects the fields from the ORC schema that are returned by InputFormat.
     *
     * @param selectedFields The indices of the fields of the ORC schema that are returned by the InputFormat.
     */
    public void selectFields(int... selectedFields) {
        // set field mapping
        this.selectedFields = selectedFields;
    }

    /**
     * Adds a filter predicate to reduce the number of rows to be returned by the input format.
     * Multiple conjunctive predicates can be added by calling this method multiple times.
     *
     * <p>Note: Predicates can significantly reduce the amount of data that is read.
     * However, the OrcInputFormat does not guarantee that all returned rows qualify the
     * predicates. Moreover, predicates are only applied if the referenced field is among the
     * selected fields.
     *
     * @param predicate The filter predicate.
     */
    public void addPredicate(Predicate predicate) {
        // validate
        validatePredicate(predicate);
        // add predicate
        this.conjunctPredicates.add(predicate);
    }

    private void validatePredicate(Predicate pred) {
        if (pred instanceof ColumnPredicate) {
            // check column name
            String colName = ((ColumnPredicate) pred).columnName;
            if (!this.schema.getFieldNames().contains(colName)) {
                throw new IllegalArgumentException("Predicate cannot be applied. " + "Column '" + colName + "' does not exist in ORC schema.");
            }
        } else if (pred instanceof Not) {
            validatePredicate(((Not) pred).child());
        } else if (pred instanceof Or) {
            for (Predicate p : ((Or) pred).children()) {
                validatePredicate(p);
            }
        }
    }

    @Override
    public void close() throws IOException {
        if (reader != null) {
            this.reader.close();
        }
        this.reader = null;
        this.schema = null;
    }

    @Override
    public boolean reachedEnd() throws IOException {
        return reader.reachedEnd();
    }

    @Override
    public T nextRecord(T reuse) throws IOException {
        return reader.nextRecord(reuse);
    }

    @VisibleForTesting
    OrcSplitReader<T> getReader() {
        return reader;
    }

    // --------------------------------------------------------------------------------------------
    // Custom serialization methods
    // --------------------------------------------------------------------------------------------
    private void writeObject(ObjectOutputStream out) throws IOException {
        out.writeInt(batchSize);
        this.conf.write(out);
        out.writeUTF(schema.toString());
        out.writeInt(selectedFields.length);
        for (int f : selectedFields) {
            out.writeInt(f);
        }
        out.writeInt(conjunctPredicates.size());
        for (Predicate p : conjunctPredicates) {
            out.writeObject(p);
        }
    }

    @SuppressWarnings("unchecked")
    private void readObject(ObjectInputStream in) throws IOException, ClassNotFoundException {
        batchSize = in.readInt();
        Configuration configuration = new Configuration();
        configuration.readFields(in);
        if (this.conf == null) {
            this.conf = configuration;
        }
        this.schema = TypeDescription.fromString(in.readUTF());
        this.selectedFields = new int[in.readInt()];
        for (int i = 0; i < selectedFields.length; i++) {
            this.selectedFields[i] = in.readInt();
        }
        this.conjunctPredicates = new ArrayList<>();
        int numPreds = in.readInt();
        for (int i = 0; i < numPreds; i++) {
            conjunctPredicates.add((Predicate) in.readObject());
        }
    }

    @Override
    public boolean supportsMultiPaths() {
        return true;
    }

    // --------------------------------------------------------------------------------------------
    // Getter methods for tests
    // --------------------------------------------------------------------------------------------
    @VisibleForTesting
    Configuration getConfiguration() {
        return conf;
    }

    @VisibleForTesting
    int getBatchSize() {
        return batchSize;
    }

    @VisibleForTesting
    String getSchema() {
        return schema.toString();
    }
}

18 View Complete Implementation : OrcValueMapper.java
Copyright Apache License 2.0
Author : apache
/**
 * To keep consistent with {@link OrcMapreduceRecordReader}'s decision on implementing
 * {@link RecordReader} with {@link NullWritable} as the key and generic type of value, the ORC Mapper will
 * read in the record as the input value.
 */
public class OrcValueMapper extends RecordKeyMapperBase<NullWritable, OrcStruct, Object, OrcValue> {

    private OrcValue outValue;

    private TypeDescription mapperSchema;

    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        super.setup(context);
        this.outValue = new OrcValue();
        this.mapperSchema = TypeDescription.fromString(context.getConfiguration().get(OrcConf.MAPRED_INPUT_SCHEMA.getAttribute()));
    }

    @Override
    protected void map(NullWritable key, OrcStruct orcStruct, Context context) throws IOException, InterruptedException {
        OrcStruct upConvertedStruct = upConvertOrcStruct(orcStruct, context);
        if (context.getNumReduceTasks() == 0) {
            this.outValue.value = upConvertedStruct;
            context.write(NullWritable.get(), this.outValue);
        } else {
            this.outValue.value = upConvertedStruct;
            context.write(getDedupKey(upConvertedStruct), this.outValue);
        }
        context.getCounter(EVENT_COUNTER.RECORD_COUNT).increment(1);
    }

    /**
     * If a {@link OrcStruct}'s schema differs from newest schema obtained when creating MR jobs (which is the
     * newest schema seen by the MR job), all the other ORC object will need to be up-converted.
     */
    OrcStruct upConvertOrcStruct(OrcStruct orcStruct, Context context) {
        // For ORC schema, if schema object differs that means schema itself is different while for Avro,
        // there are chances that documentation or attributes' difference lead to the schema object difference.
        if (!orcStruct.getSchema().equals(mapperSchema)) {
            OrcStruct newStruct = new OrcStruct(mapperSchema);
            int indexInNewSchema = 0;
            List<String> oldSchemaFieldNames = orcStruct.getSchema().getFieldNames();
            List<TypeDescription> oldSchemaTypes = orcStruct.getSchema().getChildren();
            List<TypeDescription> newSchemaTypes = mapperSchema.getChildren();
            for (String field : mapperSchema.getFieldNames()) {
                if (oldSchemaFieldNames.contains(field)) {
                    int fieldIndex = oldSchemaFieldNames.indexOf(field);
                    TypeDescription fileType = oldSchemaTypes.get(fieldIndex);
                    TypeDescription readerType = newSchemaTypes.get(indexInNewSchema);
                    if (isEvolutionValid(fileType, readerType)) {
                        newStruct.setFieldValue(field, orcStruct.getFieldValue(field));
                    } else {
                        throw new SchemaEvolution.IllegalEvolutionException(String.format("ORC does not support type conversion from file" + " type %s to reader type %s ", fileType.toString(), readerType.toString()));
                    }
                } else {
                    newStruct.setFieldValue(field, null);
                }
                indexInNewSchema++;
            }
            return newStruct;
        } else {
            return orcStruct;
        }
    }

    /**
     * Determine if two types are following valid evolution.
     * Implementation stolen and manipulated from {@link SchemaEvolution} as that was package-private.
     */
    static boolean isEvolutionValid(TypeDescription fileType, TypeDescription readerType) {
        boolean isOk = true;
        if (fileType.getCategory() == readerType.getCategory()) {
            switch(readerType.getCategory()) {
                case BOOLEAN:
                case BYTE:
                case SHORT:
                case INT:
                case LONG:
                case DOUBLE:
                case FLOAT:
                case STRING:
                case TIMESTAMP:
                case BINARY:
                case DATE:
                    // these are always a match
                    break;
                case CHAR:
                case VARCHAR:
                    break;
                case DECIMAL:
                    break;
                case UNION:
                case MAP:
                case LIST:
                    {
                        // these must be an exact match
                        List<TypeDescription> fileChildren = fileType.getChildren();
                        List<TypeDescription> readerChildren = readerType.getChildren();
                        if (fileChildren.size() == readerChildren.size()) {
                            for (int i = 0; i < fileChildren.size(); ++i) {
                                isOk &= isEvolutionValid(fileChildren.get(i), readerChildren.get(i));
                            }
                            return isOk;
                        } else {
                            return false;
                        }
                    }
                case STRUCT:
                    {
                        List<TypeDescription> readerChildren = readerType.getChildren();
                        List<TypeDescription> fileChildren = fileType.getChildren();
                        List<String> readerFieldNames = readerType.getFieldNames();
                        List<String> fileFieldNames = fileType.getFieldNames();
                        final Map<String, TypeDescription> fileTypesIdx = new HashMap<>();
                        for (int i = 0; i < fileFieldNames.size(); i++) {
                            final String fileFieldName = fileFieldNames.get(i);
                            fileTypesIdx.put(fileFieldName, fileChildren.get(i));
                        }
                        for (int i = 0; i < readerFieldNames.size(); i++) {
                            final String readerFieldName = readerFieldNames.get(i);
                            TypeDescription readerField = readerChildren.get(i);
                            TypeDescription fileField = fileTypesIdx.get(readerFieldName);
                            if (fileField == null) {
                                continue;
                            }
                            isOk &= isEvolutionValid(fileField, readerField);
                        }
                        return isOk;
                    }
                default:
                    throw new IllegalArgumentException("Unknown type " + readerType);
            }
            return isOk;
        } else {
            /*
             * Check for the few cases where ORC will not convert between categories.
             */
            return ConvertTreeReaderFactory.canConvert(fileType, readerType);
        }
    }

    /**
     * By default, the dedup key contains the whole ORC record, except MAP fields, since {@link org.apache.orc.mapred.OrcMap}
     * is a {@link java.util.TreeMap} implementation and cannot be compared when the records within the map differ.
     */
    protected OrcKey getDedupKey(OrcStruct originalRecord) {
        return convertOrcStructToOrcKey(originalRecord);
    }

    /**
     * The mapper's output key needs to be comparable. When the ORC record itself must serve as the
     * output key, this conversion is necessary.
     */
    protected OrcKey convertOrcStructToOrcKey(OrcStruct struct) {
        OrcKey orcKey = new OrcKey();
        orcKey.key = struct;
        return orcKey;
    }
}
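
For orientation, here is a minimal, hypothetical sketch (not part of the source project) of the up-conversion path above: an old record is projected onto a newer struct schema, missing fields stay null, and the result is wrapped into an OrcKey the way convertOrcStructToOrcKey does. Schema strings and field names are illustrative.

// Assumed sketch: project an old record onto a newer struct schema, then wrap it as an OrcKey.
TypeDescription oldSchema = TypeDescription.fromString("struct<i:int,j:int>");
TypeDescription newSchema = TypeDescription.fromString("struct<i:int,j:int,k:int>");

OrcStruct oldRecord = new OrcStruct(oldSchema);
oldRecord.setFieldValue("i", new IntWritable(1));
oldRecord.setFieldValue("j", new IntWritable(2));

OrcStruct upConverted = new OrcStruct(newSchema);
for (String field : newSchema.getFieldNames()) {
    // copy matching fields; "k" stays null because the old schema does not have it
    if (oldSchema.getFieldNames().contains(field)) {
        upConverted.setFieldValue(field, oldRecord.getFieldValue(field));
    }
}

// wrap the struct in an OrcKey, as convertOrcStructToOrcKey above does
OrcKey key = new OrcKey();
key.key = upConverted;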

18 View Complete Implementation : OrcKeyComparatorTest.java
Copyright Apache License 2.0
Author : apache
/**
 * Create an {@link OrcList} that repeats the given element the specified number of times.
 */
private OrcList createOrcList(int element, TypeDescription schema, int num) {
    OrcList result = new OrcList(schema);
    for (int i = 0; i < num; i++) {
        result.add(new IntWritable(element));
    }
    return result;
}
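
A conceptual usage of this test helper (the helper itself is private to the test class, so the call here is only illustrative): the list schema is built with the standard ORC factory methods.

// Assumed usage: build a list<int> schema and repeat the same element three times.
TypeDescription listSchema = TypeDescription.createList(TypeDescription.createInt());
OrcList repeated = createOrcList(7, listSchema, 3);   // [7, 7, 7] as IntWritable values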

18 View Complete Implementation : OrcFileAppender.java
Copyright Apache License 2.0
Author : apache
@SuppressWarnings("unchecked")
private static <D> OrcValueWriter<D> newOrcValueWriter(TypeDescription schema, Function<TypeDescription, OrcValueWriter<?>> createWriterFunc) {
    return (OrcValueWriter<D>) createWriterFunc.apply(schema);
}

18 View Complete Implementation : OrcSchemaConverter.java
Copyright Apache License 2.0
Author : pentaho
private int determineFormatType(TypeDescription subDescription) {
    switch(subDescription.getCategory().getName()) {
        case "string":
            return OrcSpec.DataType.STRING.getId();
        case "char":
            return OrcSpec.DataType.CHAR.getId();
        case "varchar":
            return OrcSpec.DataType.VARCHAR.getId();
        case "bigint":
            return OrcSpec.DataType.BIGINT.getId();
        case "float":
            return OrcSpec.DataType.FLOAT.getId();
        case "double":
            return OrcSpec.DataType.DOUBLE.getId();
        case "decimal":
            return OrcSpec.DataType.DECIMAL.getId();
        case "timestamp":
            return OrcSpec.DataType.TIMESTAMP.getId();
        case "date":
            return OrcSpec.DataType.DATE.getId();
        case "boolean":
            return OrcSpec.DataType.BOOLEAN.getId();
        case "binary":
            return OrcSpec.DataType.BINARY.getId();
        case "int":
            return OrcSpec.DataType.INTEGER.getId();
        case "tinyint":
            return OrcSpec.DataType.TINYINT.getId();
        case "smallint":
            return OrcSpec.DataType.SMALLINT.getId();
    }
    // if none of the cases match, return -1
    return -1;
}
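
A short, assumed usage of this mapping (the method is private, so this is conceptual): the category name of a TypeDescription parsed from a type string drives the switch above. OrcSpec.DataType is Pentaho-specific; the type strings are standard ORC.

// Assumed sketch: the category name selects the case label.
int stringId  = determineFormatType(TypeDescription.fromString("string"));         // OrcSpec.DataType.STRING.getId()
int decimalId = determineFormatType(TypeDescription.fromString("decimal(10,2)"));  // OrcSpec.DataType.DECIMAL.getId()
int unknownId = determineFormatType(TypeDescription.fromString("uniontype<int,string>")); // -1, no case matches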

18 View Complete Implementation : OrcStructConverter.java
Copyright Apache License 2.0
Author : apache
private static Map<Object, Object> convertMap(TypeDescription fieldDescription, OrcMap<? extends WritableComparable, ? extends WritableComparable> map, boolean binaryAsString) {
    Map<Object, Object> converted = new HashMap<>();
    TypeDescription keyDescription = fieldDescription.getChildren().get(0);
    TypeDescription valueDescription = fieldDescription.getChildren().get(1);
    for (WritableComparable key : map.navigableKeySet()) {
        Object newKey = convertPrimitive(keyDescription, key, binaryAsString);
        if (valueDescription.getCategory().isPrimitive()) {
            converted.put(newKey, convertPrimitive(valueDescription, map.get(key), binaryAsString));
        } else {
            converted.put(newKey, map.get(key));
        }
    }
    return converted;
}
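
An illustrative setup for the converter above (assumed, not from the source project): a map<string,int> schema whose first child is the key type and second child is the value type, which is exactly how convertMap reads the children.

// Assumed sketch: an OrcMap matching a map<string,int> schema.
TypeDescription mapSchema = TypeDescription.fromString("map<string,int>");
OrcMap<Text, IntWritable> orcMap = new OrcMap<>(mapSchema);
orcMap.put(new Text("a"), new IntWritable(1));
orcMap.put(new Text("b"), new IntWritable(2));
// convertMap(mapSchema, orcMap, false) would yield {"a"=1, "b"=2} with primitive values converted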

18 View Complete Implementation : DefaultORCSchemaProvider.java
Copyright Apache License 2.0
Author : pinterest
/**
 * This method is used for fetching all ORC schemas from config
 *
 * @param config
 */
private void setSchemas(SecorConfig config) {
    Map<String, String> schemaPerTopic = config.getORCMessageSchema();
    for (Entry<String, String> entry : schemaPerTopic.entrySet()) {
        String topic = entry.getKey();
        TypeDescription schema = TypeDescription.fromString(entry.getValue());
        topicToSchemaMap.put(topic, schema);
        // If common schema is given
        if ("*".equals(topic)) {
            schemaForAlltopic = schema;
        }
    }
}
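
The shape of the per-topic schema map consumed above might look like the following; the exact Secor property names are configuration-specific and assumed here, but the values are standard ORC type strings parsed by TypeDescription.fromString.

// Assumed example of the topic-to-schema mapping; property keys are illustrative.
Map<String, String> schemaPerTopic = new HashMap<>();
schemaPerTopic.put("events", "struct<id:bigint,name:string,ts:timestamp>");
schemaPerTopic.put("*", "struct<payload:string>");   // fallback schema applied to all topics
TypeDescription eventsSchema = TypeDescription.fromString(schemaPerTopic.get("events"));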

18 View Complete Implementation : ORCSchemaUtil.java
Copyright Apache License 2.0
Author : apache
private static Optional<Integer> icebergID(TypeDescription orcType) {
    return Optional.ofNullable(orcType.getAttributeValue(ICEBERG_ID_ATTRIBUTE)).map(Integer::parseInt);
}

18 View Complete Implementation : TestORCSchemaUtil.java
Copyright Apache License 2.0
Author : apache
@Test
public void testRoundtripConversionNested() {
    Types.StructType leafStructType = Types.StructType.of(optional(6, "leafLongCol", Types.LongType.get()), optional(7, "leafBinaryCol", Types.BinaryType.get()));
    Types.StructType nestedStructType = Types.StructType.of(optional(4, "longCol", Types.LongType.get()), optional(5, "leafStructCol", leafStructType));
    Types.StructType structPrimTypeForList = Types.StructType.of(optional(506, "leafLongCol", Types.LongType.get()), optional(507, "leafBinaryCol", Types.BinaryType.get()));
    Types.StructType leafStructTypeForList = Types.StructType.of(optional(516, "leafLongCol", Types.LongType.get()), optional(517, "leafBinaryCol", Types.BinaryType.get()));
    Types.StructType nestedStructTypeForList = Types.StructType.of(optional(504, "longCol", Types.LongType.get()), optional(505, "leafStructCol", leafStructTypeForList));
    Types.StructType structPrimTypeForMap = Types.StructType.of(optional(606, "leafLongCol", Types.LongType.get()), optional(607, "leafBinaryCol", Types.BinaryType.get()));
    Types.StructType leafStructTypeForMap = Types.StructType.of(optional(616, "leafLongCol", Types.LongType.get()), optional(617, "leafBinaryCol", Types.BinaryType.get()));
    Types.StructType nestedStructTypeForMap = Types.StructType.of(optional(604, "longCol", Types.LongType.get()), optional(605, "leafStructCol", leafStructTypeForMap));
    Types.StructType leafStructTypeForStruct = Types.StructType.of(optional(716, "leafLongCol", Types.LongType.get()), optional(717, "leafBinaryCol", Types.BinaryType.get()));
    Types.StructType nestedStructTypeForStruct = Types.StructType.of(optional(704, "longCol", Types.LongType.get()), optional(705, "leafStructCol", leafStructTypeForStruct));
    // all fields in expected iceberg schema will be optional since we don't have a column mapping
    Schema expectedSchema = new Schema(optional(1, "intCol", Types.IntegerType.get()), optional(2, "longCol", Types.LongType.get()), optional(3, "nestedStructCol", nestedStructType), optional(8, "intCol3", Types.IntegerType.get()), optional(9, "doubleCol", Types.DoubleType.get()), required(10, "uuidCol", Types.UUIDType.get()), optional(20, "booleanCol", Types.BooleanType.get()), optional(21, "fixedCol", Types.FixedType.ofLength(4096)), required(22, "binaryCol", Types.BinaryType.get()), required(23, "stringCol", Types.StringType.get()), required(24, "decimalCol", Types.DecimalType.of(15, 3)), required(25, "floatCol", Types.FloatType.get()), optional(30, "dateCol", Types.DateType.get()), required(32, "timeCol", Types.TimeType.get()), required(34, "timestampCol", Types.TimestampType.withZone()), required(35, "listPrimCol", Types.ListType.ofRequired(135, Types.LongType.get())), required(36, "listPrimNestCol", Types.ListType.ofRequired(136, structPrimTypeForList)), required(37, "listNestedCol", Types.ListType.ofRequired(137, nestedStructTypeForList)), optional(38, "mapPrimCol", Types.MapType.ofRequired(138, 238, Types.StringType.get(), Types.FixedType.ofLength(4096))), required(39, "mapPrimNestCol", Types.MapType.ofRequired(139, 239, Types.StringType.get(), structPrimTypeForMap)), required(40, "mapNestedCol", Types.MapType.ofRequired(140, 240, Types.StringType.get(), nestedStructTypeForMap)), required(41, "structListNestCol", Types.ListType.ofRequired(241, Types.StructType.of(optional(816, "leafLongCol", Types.LongType.get()), optional(817, "leafBinaryCol", Types.BinaryType.get())))), required(42, "structMapNestCol", Types.MapType.ofRequired(242, 342, Types.StringType.get(), Types.StructType.of(optional(916, "leafLongCol", Types.LongType.get()), optional(917, "leafBinaryCol", Types.BinaryType.get())))), required(43, "structStructNestCol", Types.StructType.of(required(243, "innerStructNest", Types.StructType.of(optional(1016, "leafLongCol", Types.LongType.get()), optional(1017, "leafBinaryCol", Types.BinaryType.get()))))), required(44, "structStructComplexNestCol", Types.StructType.of(required(244, "innerStructNest", Types.StructType.of(optional(1116, "leafLongCol", Types.LongType.get()), optional(1117, "leftMapOfListStructCol", Types.MapType.ofRequired(1150, 1151, Types.StringType.get(), Types.ListType.ofRequired(1250, nestedStructTypeForStruct))))))));
    TypeDescription orcSchema = ORCSchemaUtil.convert(expectedSchema);
    assertEquals(expectedSchema.asStruct(), ORCSchemaUtil.convert(orcSchema).asStruct());
}

18 View Complete Implementation : TestORCSchemaUtil.java
Copyright Apache License 2.0
Author : apache
@Test
public void testRoundtripConversionPrimitive() {
    Schema expectedSchema = new Schema(optional(1, "intCol", Types.IntegerType.get()), optional(3, "longCol", Types.LongType.get()), optional(6, "intCol2", Types.IntegerType.get()), optional(20, "intCol3", Types.IntegerType.get()), required(9, "doubleCol", Types.DoubleType.get()), required(10, "uuidCol", Types.UUIDType.get()), optional(2, "booleanCol", Types.BooleanType.get()), optional(21, "fixedCol", Types.FixedType.ofLength(4096)), required(22, "binaryCol", Types.BinaryType.get()), required(23, "stringCol", Types.StringType.get()), required(24, "decimalCol", Types.DecimalType.of(15, 3)), required(25, "floatCol", Types.FloatType.get()), optional(30, "dateCol", Types.DateType.get()), required(32, "timeCol", Types.TimeType.get()), required(34, "timestampCol", Types.TimestampType.withZone()));
    TypeDescription orcSchema = ORCSchemaUtil.convert(expectedSchema);
    assertEquals(expectedSchema.asStruct(), ORCSchemaUtil.convert(orcSchema).asStruct());
}

18 View Complete Implementation : TestORCSchemaUtil.java
Copyright Apache License 2.0
Author : apache
@Test
public void testInvalidTypePromotions() {
    Schema originalSchema = new Schema(optional(1, "a", Types.LongType.get()));
    TypeDescription orcSchema = ORCSchemaUtil.convert(originalSchema);
    Schema evolveSchema = new Schema(optional(1, "a", Types.IntegerType.get()));
    assertThrows("Should not allow invalid type promotion", IllegalArgumentException.class, "Can not promote", () -> {
        ORCSchemaUtil.buildOrcProjection(evolveSchema, orcSchema);
    });
}
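
As a counterpart sketch (assumed, not part of the original test): widening int to long is a legal promotion, so building the projection in the other direction should succeed rather than throw.

// Assumed sketch: the allowed widening direction, int -> long.
Schema fileSchema = new Schema(optional(1, "a", Types.IntegerType.get()));
TypeDescription fileOrcSchema = ORCSchemaUtil.convert(fileSchema);
Schema widenedSchema = new Schema(optional(1, "a", Types.LongType.get()));
TypeDescription projection = ORCSchemaUtil.buildOrcProjection(widenedSchema, fileOrcSchema);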

17 View Complete Implementation : OrcStructConverter.java
Copyright Apache License 2.0
Author : apache
/**
 * Convert an ORC struct field of the "root" {@link OrcStruct} that represents the "row". This method has a cache of
 * field names to field index that is ONLY valid for this {@link OrcStruct}, and should not be used for
 * nested {@link OrcStruct} fields of the row. Looks up field index by field name, and delegates to
 * {@link OrcStructConverter#convertField(OrcStruct, int)}.
 */
@Nullable
Object convertRootField(OrcStruct struct, String fieldName) {
    // this cache is only valid for the root level, to skip the indexOf on fieldNames to get the fieldIndex.
    TypeDescription schema = struct.getSchema();
    final List<String> fields = schema.getFieldNames();
    if (fieldIndexCache == null) {
        fieldIndexCache = new Object2IntOpenHashMap<>(fields.size());
        for (int i = 0; i < fields.size(); i++) {
            fieldIndexCache.put(fields.get(i), i);
        }
    }
    int fieldIndex = fieldIndexCache.getOrDefault(fieldName, -1);
    return convertField(struct, fieldIndex);
}
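
The cache replaces the lookup sketched below (assumed illustration); without it, every root-field access would pay an indexOf over the schema's field name list.

// Assumed equivalent lookup without the Object2IntOpenHashMap cache.
TypeDescription schema = struct.getSchema();
int fieldIndex = schema.getFieldNames().indexOf(fieldName);   // -1 if absent, matching the cache default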

17 View Complete Implementation : OrcBatchReader.java
Copyright Apache License 2.0
Author : apache
/**
 * Reads a vector of data into an array of objects.
 *
 * @param vals The array that needs to be filled.
 * @param fieldIdx If the vals array is an array of Row, the index of the field that needs to be filled.
 *                 Otherwise -1 must be passed and the data is filled directly into the array.
 * @param schema The schema of the vector to read.
 * @param vector The vector to read.
 * @param childCount The number of vector entries to read.
 */
private static void readField(Object[] vals, int fieldIdx, TypeDescription schema, ColumnVector vector, int childCount) {
    // check the type of the vector to decide how to read it.
    switch(schema.getCategory()) {
        case BOOLEAN:
            if (vector.noNulls) {
                readNonNullLongColumn(vals, fieldIdx, (LongColumnVector) vector, childCount, OrcBatchReader::readBoolean);
            } else {
                readLongColumn(vals, fieldIdx, (LongColumnVector) vector, childCount, OrcBatchReader::readBoolean);
            }
            break;
        case BYTE:
            if (vector.noNulls) {
                readNonNullLongColumn(vals, fieldIdx, (LongColumnVector) vector, childCount, OrcBatchReader::readByte);
            } else {
                readLongColumn(vals, fieldIdx, (LongColumnVector) vector, childCount, OrcBatchReader::readByte);
            }
            break;
        case SHORT:
            if (vector.noNulls) {
                readNonNullLongColumn(vals, fieldIdx, (LongColumnVector) vector, childCount, OrcBatchReader::readShort);
            } else {
                readLongColumn(vals, fieldIdx, (LongColumnVector) vector, childCount, OrcBatchReader::readShort);
            }
            break;
        case INT:
            if (vector.noNulls) {
                readNonNullLongColumn(vals, fieldIdx, (LongColumnVector) vector, childCount, OrcBatchReader::readInt);
            } else {
                readLongColumn(vals, fieldIdx, (LongColumnVector) vector, childCount, OrcBatchReader::readInt);
            }
            break;
        case LONG:
            if (vector.noNulls) {
                readNonNullLongColumn(vals, fieldIdx, (LongColumnVector) vector, childCount, OrcBatchReader::readLong);
            } else {
                readLongColumn(vals, fieldIdx, (LongColumnVector) vector, childCount, OrcBatchReader::readLong);
            }
            break;
        case FLOAT:
            if (vector.noNulls) {
                readNonNullDoubleColumn(vals, fieldIdx, (DoubleColumnVector) vector, childCount, OrcBatchReader::readFloat);
            } else {
                readDoubleColumn(vals, fieldIdx, (DoubleColumnVector) vector, childCount, OrcBatchReader::readFloat);
            }
            break;
        case DOUBLE:
            if (vector.noNulls) {
                readNonNullDoubleColumn(vals, fieldIdx, (DoubleColumnVector) vector, childCount, OrcBatchReader::readDouble);
            } else {
                readDoubleColumn(vals, fieldIdx, (DoubleColumnVector) vector, childCount, OrcBatchReader::readDouble);
            }
            break;
        case CHAR:
        case VARCHAR:
        case STRING:
            if (vector.noNulls) {
                readNonNullBytesColumnAsString(vals, fieldIdx, (BytesColumnVector) vector, childCount);
            } else {
                readBytesColumnAsString(vals, fieldIdx, (BytesColumnVector) vector, childCount);
            }
            break;
        case DATE:
            if (vector.noNulls) {
                readNonNullLongColumnAsDate(vals, fieldIdx, (LongColumnVector) vector, childCount);
            } else {
                readLongColumnAsDate(vals, fieldIdx, (LongColumnVector) vector, childCount);
            }
            break;
        case TIMESTAMP:
            if (vector.noNulls) {
                readNonNullTimestampColumn(vals, fieldIdx, (TimestampColumnVector) vector, childCount);
            } else {
                readTimestampColumn(vals, fieldIdx, (TimestampColumnVector) vector, childCount);
            }
            break;
        case BINARY:
            if (vector.noNulls) {
                readNonNullBytesColumnAsBinary(vals, fieldIdx, (BytesColumnVector) vector, childCount);
            } else {
                readBytesColumnAsBinary(vals, fieldIdx, (BytesColumnVector) vector, childCount);
            }
            break;
        case DECIMAL:
            if (vector.noNulls) {
                readNonNullDecimalColumn(vals, fieldIdx, (DecimalColumnVector) vector, childCount);
            } else {
                readDecimalColumn(vals, fieldIdx, (DecimalColumnVector) vector, childCount);
            }
            break;
        case STRUCT:
            if (vector.noNulls) {
                readNonNullStructColumn(vals, fieldIdx, (StructColumnVector) vector, schema, childCount);
            } else {
                readStructColumn(vals, fieldIdx, (StructColumnVector) vector, schema, childCount);
            }
            break;
        case LIST:
            if (vector.noNulls) {
                readNonNullListColumn(vals, fieldIdx, (ListColumnVector) vector, schema, childCount);
            } else {
                readListColumn(vals, fieldIdx, (ListColumnVector) vector, schema, childCount);
            }
            break;
        case MAP:
            if (vector.noNulls) {
                readNonNullMapColumn(vals, fieldIdx, (MapColumnVector) vector, schema, childCount);
            } else {
                readMapColumn(vals, fieldIdx, (MapColumnVector) vector, schema, childCount);
            }
            break;
        case UNION:
            throw new UnsupportedOperationException("UNION type not supported yet");
        default:
            throw new IllegalArgumentException("Unknown type " + schema);
    }
}
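
A minimal, assumed driver for a per-column reader like the one above: create a row batch from the schema, pull batches from an ORC file, and hand each column vector to the type-dispatched logic. The file path is illustrative; Path and Configuration are the usual Hadoop classes, and readField itself is private, so its call is shown only as a comment.

// Assumed sketch of driving a vectorized read with a TypeDescription-derived batch.
TypeDescription schema = TypeDescription.fromString("struct<id:bigint,name:string>");
Reader reader = OrcFile.createReader(new Path("/tmp/example.orc"),
        OrcFile.readerOptions(new Configuration()));
RecordReader rows = reader.rows(reader.options().schema(schema));
VectorizedRowBatch batch = schema.createRowBatch();
while (rows.nextBatch(batch)) {
    Object[] vals = new Object[batch.size];
    // each column vector is dispatched on its child type, e.g. for column 0 ("id"):
    // readField(vals, -1, schema.getChildren().get(0), batch.cols[0], batch.size);
}
rows.close();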

17 View Complete Implementation : OrcKeyComparatorTest.java
Copyright Apache License 2.0
Author : apache
private OrcUnion createOrcUnion(TypeDescription schema, WritableComparable value) {
    OrcUnion result = new OrcUnion(schema);
    result.set(0, value);
    return result;
}

17 View Complete Implementation : OrcKeyComparatorTest.java
Copyright Apache License 2.0
Author : apache
private OrcStruct createSimpleOrcStruct(TypeDescription structSchema, int value1, int value2) {
    OrcStruct result = new OrcStruct(structSchema);
    result.setFieldValue(0, new IntWritable(value1));
    result.setFieldValue(1, new IntWritable(value2));
    return result;
}
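
The two-field struct schema this helper expects can also be built programmatically instead of parsed from a string; the call below is conceptual since the helper is private to the test class.

// Assumed companion setup using the standard ORC schema builders.
TypeDescription structSchema = TypeDescription.createStruct()
        .addField("a", TypeDescription.createInt())
        .addField("b", TypeDescription.createInt());
OrcStruct pair = createSimpleOrcStruct(structSchema, 1, 2);   // {a: 1, b: 2}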

17 View Complete Implementation : OrcValueMapperTest.java
Copyright Apache License 2.0
Author : apache
@Test
public void testIsEvolutionValid() {
    TypeDescription schema_1 = TypeDescription.fromString("struct<i:int,j:int,k:int>");
    TypeDescription schema_2 = TypeDescription.fromString("struct<i:int,j:int,k:bigint>");
    TypeDescription schema_3 = TypeDescription.fromString("struct<i:int,j:int,k:tinyint>");
    TypeDescription schema_4 = TypeDescription.fromString("struct<i:int,j:int>");
    Assert.assertTrue(OrcValueMapper.isEvolutionValid(schema_1, schema_2));
    Assert.assertTrue(OrcValueMapper.isEvolutionValid(schema_1, schema_3));
    Assert.assertTrue(OrcValueMapper.isEvolutionValid(schema_1, schema_4));
    Assert.assertTrue(OrcValueMapper.isEvolutionValid(schema_4, schema_1));
}

17 View Complete Implementation : GenericOrcWriter.java
Copyright Apache License 2.0
Author : apache
private static Converter[] buildConverters(TypeDescription schema) {
    if (schema.getCategory() != TypeDescription.Category.STRUCT) {
        throw new IllegalArgumentException("Top level must be a struct " + schema);
    }
    List<TypeDescription> children = schema.getChildren();
    Converter[] result = new Converter[children.size()];
    for (int c = 0; c < children.size(); ++c) {
        result[c] = buildConverter(children.get(c));
    }
    return result;
}

17 View Complete Implementation : OrcIterable.java
Copyright Apache License 2.0
Author : apache
private static VectorizedRowBatchIterator newOrcIterator(InputFile file, TypeDescription readerSchema, Long start, Long length, Reader orcFileReader) {
    final Reader.Options options = orcFileReader.options();
    if (start != null) {
        options.range(start, length);
    }
    options.schema(readerSchema);
    try {
        return new VectorizedRowBatchIterator(file.location(), readerSchema, orcFileReader.rows(options));
    } catch (IOException ioe) {
        throw new RuntimeIOException(ioe, "Failed to get ORC rows for file: %s", file);
    }
}
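
For context, a plain-ORC counterpart of the options handling above might look like the following (assumed; Iceberg's InputFile and VectorizedRowBatchIterator are project-specific and omitted here): the reader is restricted to a byte range and projected with a reader-side schema before rows are requested.

// Assumed sketch of range- and schema-restricted reading with the plain ORC Reader API.
Reader orcFileReader = OrcFile.createReader(new Path("/tmp/data.orc"),
        OrcFile.readerOptions(new Configuration()));
Reader.Options options = orcFileReader.options()
        .range(0L, 4096L)                                          // start offset and length in bytes
        .schema(TypeDescription.fromString("struct<id:bigint>"));  // reader-side projection schema
RecordReader rows = orcFileReader.rows(options);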