org.apache.tika.Tika - java examples

Here are examples of the Java API org.apache.tika.Tika, taken from open source projects. By voting up, you can indicate which examples are most useful and appropriate.

109 Examples 7

19 View Complete Implementation : AdvancedTypeDetector.java
Copyright GNU General Public License v2.0
Author : SOBotics
/**
 * Detects the media type for a name using a {@code Tika} facade that is built
 * from an explicitly loaded MIME-type definition resource.
 *
 * @param name the name passed to {@code Tika.detect(String)}
 * @return the media type string returned by the detector
 * @throws Exception propagated from creating the MIME-type registry or the facade
 */
public static String detectWithCustomConfig(String name) throws Exception {
    final String mimeTypesResource = "/org/apache/tika/mime/tika-mimetypes.xml";
    Tika detector = new Tika(MimeTypesFactory.create(mimeTypesResource));
    String detectedType = detector.detect(name);
    return detectedType;
}

19 View Complete Implementation : ForkParserIntegrationTest.java
Copyright GNU General Public License v2.0
Author : SOBotics
/**
 * Test that the ForkParser correctly behaves when
 *  wired in to the regular Parsers and their test data
 */
public clreplaced ForkParserIntegrationTest {

    // TODO Use TikaConfig instead, when it works
    private Tika tika = new Tika();

    /**
     * Simple text parsing
     */
    @Test
    public void testForkedTextParsing() throws Exception {
        ForkParser parser = new ForkParser(ForkParserIntegrationTest.clreplaced.getClreplacedLoader(), tika.getParser());
        try {
            ContentHandler output = new BodyContentHandler();
            InputStream stream = ForkParserIntegrationTest.clreplaced.getResourcereplacedtream("/test-doreplacedents/testTXT.txt");
            ParseContext context = new ParseContext();
            parser.parse(stream, output, new Metadata(), context);
            String content = output.toString();
            replacedertContains("Test d'indexation", content);
            replacedertContains("http://www.apache.org", content);
        } finally {
            parser.close();
        }
    }

    /**
     * This error has a message and an equals() implementation as to be able
     * to match it against the serialized version of itself.
     */
    static clreplaced AnError extends Error {

        private static final long serialVersionUID = -6197267350768803348L;

        private String message;

        AnError(String message) {
            super(message);
            this.message = message;
        }

        @Override
        public boolean equals(Object o) {
            if (this == o)
                return true;
            if (o == null || getClreplaced() != o.getClreplaced())
                return false;
            AnError anError = (AnError) o;
            if (!message.equals(anError.message))
                return false;
            return true;
        }

        @Override
        public int hashCode() {
            return message.hashCode();
        }
    }

    /**
     * This error isn't serializable on the server, so can't be sent back
     *  to the Fork Client once it has occured
     */
    static clreplaced WontBeSerializedError extends RuntimeException {

        private static final long serialVersionUID = 1L;

        WontBeSerializedError(String message) {
            super(message);
        }

        private void writeObject(java.io.ObjectOutputStream out) {
            RuntimeException e = new RuntimeException("Bang!");
            boolean found = false;
            for (StackTraceElement ste : e.getStackTrace()) {
                if (ste.getClreplacedName().equals(ForkParser.clreplaced.getName())) {
                    found = true;
                    break;
                }
            }
            if (!found) {
                throw e;
            }
        }
    }

    static clreplaced BrokenParser implements Parser {

        private static final long serialVersionUID = 995871497930817839L;

        public Error err = new AnError("Simulated fail");

        public RuntimeException re = null;

        public Set<MediaType> getSupportedTypes(ParseContext context) {
            return new HashSet<MediaType>(Arrays.asList(MediaType.TEXT_PLAIN));
        }

        public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
            if (re != null)
                throw re;
            throw err;
        }
    }

    /**
     * TIKA-831 Parsers throwing errors should be caught and
     *  properly reported
     */
    @Test
    public void testParsingErrorInForkedParserShouldBeReported() throws Exception {
        BrokenParser brokenParser = new BrokenParser();
        Parser parser = new ForkParser(ForkParser.clreplaced.getClreplacedLoader(), brokenParser);
        InputStream stream = getClreplaced().getResourcereplacedtream("/test-doreplacedents/testTXT.txt");
        // With a serializable error, we'll get that back
        try {
            ContentHandler output = new BodyContentHandler();
            ParseContext context = new ParseContext();
            parser.parse(stream, output, new Metadata(), context);
            fail("Expected TikaException caused by Error");
        } catch (TikaException e) {
            replacedertEquals(brokenParser.err, e.getCause());
        }
        // With a non serializable one, we'll get something else
        // TODO Fix this test
        brokenParser = new BrokenParser();
        brokenParser.re = new WontBeSerializedError("Can't Serialize");
        parser = new ForkParser(ForkParser.clreplaced.getClreplacedLoader(), brokenParser);
    // try {
    // ContentHandler output = new BodyContentHandler();
    // ParseContext context = new ParseContext();
    // parser.parse(stream, output, new Metadata(), context);
    // fail("Expected TikaException caused by Error");
    // } catch (TikaException e) {
    // replacedertEquals(TikaException.clreplaced, e.getCause().getClreplaced());
    // replacedertEquals("Bang!", e.getCause().getMessage());
    // }
    }

    /**
     * If we supply a non serializable object on the ParseContext,
     *  check we get a helpful exception back
     */
    @Test
    public void testParserHandlingOfNonSerializable() throws Exception {
        ForkParser parser = new ForkParser(ForkParserIntegrationTest.clreplaced.getClreplacedLoader(), tika.getParser());
        ParseContext context = new ParseContext();
        context.set(Detector.clreplaced, new Detector() {

            public MediaType detect(InputStream input, Metadata metadata) {
                return MediaType.OCTET_STREAM;
            }
        });
        try {
            ContentHandler output = new BodyContentHandler();
            InputStream stream = ForkParserIntegrationTest.clreplaced.getResourcereplacedtream("/test-doreplacedents/testTXT.txt");
            parser.parse(stream, output, new Metadata(), context);
            fail("Should have blown up with a non serializable ParseContext");
        } catch (TikaException e) {
            // Check the right details
            replacedertNotNull(e.getCause());
            replacedertEquals(NotSerializableException.clreplaced, e.getCause().getClreplaced());
            replacedertEquals("Unable to serialize ParseContext to preplaced to the Forked Parser", e.getMessage());
        } finally {
            parser.close();
        }
    }

    /**
     * TIKA-832
     */
    @Test
    public void testAttachingADebuggerOnTheForkedParserShouldWork() throws Exception {
        ParseContext context = new ParseContext();
        context.set(Parser.clreplaced, tika.getParser());
        ForkParser parser = new ForkParser(ForkParserIntegrationTest.clreplaced.getClreplacedLoader(), tika.getParser());
        parser.setJavaCommand(Arrays.asList("java", "-Xmx32m", "-Xdebug", "-Xrunjdwp:transport=dt_socket,address=54321,server=y,suspend=n"));
        try {
            ContentHandler body = new BodyContentHandler();
            InputStream stream = ForkParserIntegrationTest.clreplaced.getResourcereplacedtream("/test-doreplacedents/testTXT.txt");
            parser.parse(stream, body, new Metadata(), context);
            String content = body.toString();
            replacedertContains("Test d'indexation", content);
            replacedertContains("http://www.apache.org", content);
        } finally {
            parser.close();
        }
    }

    /**
     * TIKA-808 - Ensure that parsing of our test PDFs work under
     * the Fork Parser, to ensure that complex parsing behaves
     */
    @Test
    public void testForkedPDFParsing() throws Exception {
        ForkParser parser = new ForkParser(ForkParserIntegrationTest.clreplaced.getClreplacedLoader(), tika.getParser());
        try {
            ContentHandler output = new BodyContentHandler();
            InputStream stream = ForkParserIntegrationTest.clreplaced.getResourcereplacedtream("/test-doreplacedents/testPDF.pdf");
            ParseContext context = new ParseContext();
            parser.parse(stream, output, new Metadata(), context);
            String content = output.toString();
            replacedertContains("Apache Tika", content);
            replacedertContains("Tika - Content replacedysis Toolkit", content);
            replacedertContains("incubator", content);
            replacedertContains("Apache Software Foundation", content);
        } finally {
            parser.close();
        }
    }
}

19 View Complete Implementation : SimpleTypeDetector.java
Copyright GNU General Public License v2.0
Author : SOBotics
/**
 * Prints the detected media type of every file named on the command line,
 * one {@code name: type} line per argument.
 *
 * @param args paths of the files to inspect
 * @throws Exception propagated from Tika detection
 */
public static void main(String[] args) throws Exception {
    Tika detector = new Tika();
    for (int i = 0; i < args.length; i++) {
        String path = args[i];
        String mediaType = detector.detect(new File(path));
        System.out.println(path + ": " + mediaType);
    }
}

19 View Complete Implementation : TikaHolder.java
Copyright Apache License 2.0
Author : lbroudoux
/**
 * Simple singleton holder for Apache Tika.
 * @author laurent
 */
public clreplaced TikaHolder {

    private static final Tika tika = new Tika();

    /**
     * @return This holder singleton's instance.
     */
    public static Tika tika() {
        return tika;
    }
}

19 View Complete Implementation : ContentDetector.java
Copyright MIT License
Author : theonedev
public clreplaced ContentDetector {

    private static final Tika tika = new Tika();

    /**
     *  Read leading information of specified stream until the charset is detected.
     *
     *  @param contentStream
     * 			stream to be read for charset detection
     *  @return
     *  			detected charset, or <tt>null</tt> if charset can not be detected
     */
    @Nullable
    public static Charset detectCharset(InputStream contentStream) {
        try {
            return UniversalEncodingDetector.detect(contentStream);
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Read leading information of specified content bytes to detect content charset.
     *
     * @param contentBytes
     * 			content to be detected
     * @return
     * 			charset of the content, or <tt>null</tt> if charset can not be detected
     */
    @Nullable
    public static Charset detectCharset(byte[] contentBytes) {
        if (contentBytes.length != 0) {
            try {
                return UniversalEncodingDetector.detect(contentBytes);
            } catch (IOException e) {
                throw new RuntimeException(e);
            }
        } else {
            return null;
        }
    }

    public static boolean isBinary(byte[] contentBytes, @Nullable String fileName) {
        if (contentBytes.length == 0)
            return false;
        MediaType mediaType = detectMediaType(contentBytes, fileName);
        return !mediaType.getType().equalsIgnoreCase("text") && !mediaType.equals(MediaType.application("xhtml+xml")) && !mediaType.equals(MediaType.APPLICATION_XML) && !mediaType.equals(MediaType.application("json")) && !mediaType.equals(MediaType.application("x-sh")) && !mediaType.equals(MediaType.application("javascript")) && !mediaType.equals(MediaType.application("x-httpd-jsp")) && !mediaType.equals(MediaType.application("x-httpd-php"));
    }

    /**
     * Get text from specified content bytes, optionally with help of file name.
     *
     * @param contentBytes
     * 			content bytes to construct text from
     * @param fileName
     * 			file name to help deciding if supplied content bytes represents text
     * @return
     * 			text representation of content bytes, or <tt>null</tt> if content
     * 			can not be converted to text
     */
    @Nullable
    public static String convertToText(byte[] contentBytes, @Nullable String fileName) {
        if (!isBinary(contentBytes, fileName)) {
            Charset charset = detectCharset(contentBytes);
            if (charset != null)
                return new String(contentBytes, charset);
            else
                return new String(contentBytes);
        } else {
            return null;
        }
    }

    public static MediaType detectMediaType(byte[] contentBytes, @Nullable String fileName) {
        return MediaType.parse(tika.detect(contentBytes, fileName));
    }

    public static MediaType detectMediaType(InputStream contentStream, @Nullable String fileName) {
        try {
            return MediaType.parse(tika.detect(contentStream, fileName));
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }
}

19 View Complete Implementation : FileTypeDetector.java
Copyright Apache License 2.0
Author : snowflakedb
/**
 * Use Tika to detect the mime type of files
 *
 * @author jhuang
 */
public clreplaced FileTypeDetector extends java.nio.file.spi.FileTypeDetector {

    private final Tika tika = new Tika();

    @Override
    public String probeContentType(Path path) throws IOException {
        return tika.detect(path.toFile());
    }
}

19 View Complete Implementation : PoiMicrosoftFileReader.java
Copyright Apache License 2.0
Author : mark-watson
/**
 * Extracts the text content of the file at the given path using Tika.
 *
 * @param docxFilePath path of the file to read
 * @return the text returned by {@code Tika.parseToString}
 * @throws IOException if the file cannot be opened or read
 * @throws TikaException if Tika fails to parse the content
 */
public static String DocxToText(String docxFilePath) throws IOException, InvalidFormatException, XmlException, TikaException {
    // try-with-resources guarantees the file is closed even if constructing
    // Tika or parsing throws before the stream would otherwise be closed
    try (FileInputStream fis = new FileInputStream(docxFilePath)) {
        Tika tika = new Tika();
        return tika.parseToString(fis);
    }
}

19 View Complete Implementation : IOUtil.java
Copyright GNU Lesser General Public License v2.1
Author : lucee
/**
 * Detects the MIME type for a file name, falling back to a default when
 * detection throws.
 *
 * @param fileName the name passed to {@code Tika.detect(String)}
 * @param defaultValue the value returned when any exception is raised
 * @return the detected type, or {@code defaultValue} on failure
 */
public static String getMimeType(String fileName, String defaultValue) {
    String mimeType;
    try {
        mimeType = new Tika().detect(fileName);
    } catch (Exception e) {
        // best effort: any failure yields the caller-supplied default
        mimeType = defaultValue;
    }
    return mimeType;
}

19 View Complete Implementation : RTFParserTest.java
Copyright GNU General Public License v2.0
Author : SOBotics
/**
 * JUnit test class for the Tika {@link RTFParser}
 */
public clreplaced RTFParserTest extends TikaTest {

    private Tika tika = new Tika();

    /**
     * Parses testRTF.rtf through the Tika parser and checks the reported
     * content type (exactly one value) and parts of the extracted text.
     */
    @Test
    public void testBasicExtraction() throws Exception {
        File file = getResourceAsFile("/test-documents/testRTF.rtf");
        Metadata metadata = new Metadata();
        StringWriter writer = new StringWriter();
        // close the file stream even when parsing fails
        try (InputStream stream = new FileInputStream(file)) {
            tika.getParser().parse(stream, new WriteOutContentHandler(writer), metadata, new ParseContext());
        }
        String content = writer.toString();
        assertEquals("application/rtf", metadata.get(Metadata.CONTENT_TYPE));
        assertEquals(1, metadata.getValues(Metadata.CONTENT_TYPE).length);
        assertContains("Test", content);
        assertContains("indexation Word", content);
    }

    /**
     * After all whitespace is removed, the text of testRTFUmlautSpaces2.rtf
     * must be exactly the expected word starting with a capital U-umlaut.
     */
    @Test
    public void testUmlautSpacesExtraction2() throws Exception {
        String content = getText("testRTFUmlautSpaces2.rtf");
        content = content.replaceAll("\\s+", "");
        assertEquals("\u00DCbersicht", content);
    }

    /**
     * Checks that the expected characters are extracted and that the first
     * checked character does not appear twice in a row.
     */
    @Test
    public void testUnicodeUCNControlWordCharacterDoublingExtraction() throws Exception {
        String content = getText("testRTFUnicodeUCNControlWordCharacterDoubling.rtf");
        assertContains("\u5E74", content);
        assertContains("\u5ff5", content);
        assertContains("0 ", content);
        assertContains("abc", content);
        assertFalse("Doubled character \u5E74", content.contains("\u5E74\u5E74"));
    }

    /**
     * A hex-escaped character in the middle of a word must be extracted as
     * part of that word.
     */
    @Test
    public void testHexEscapeInsideWord() throws Exception {
        String content = getText("testRTFHexEscapeInsideWord.rtf");
        assertContains("ESP\u00cdRITO", content);
    }

    /**
     * Checks the lower-case and upper-case sample sentences extracted from
     * testRTFWindowsCodepage1250.rtf.
     */
    @Test
    public void testWindowsCodepage1250() throws Exception {
        String content = getText("testRTFWindowsCodepage1250.rtf");
        assertContains("za\u017c\u00f3\u0142\u0107 g\u0119\u015bl\u0105 ja\u017a\u0144", content);
        assertContains("ZA\u017b\u00d3\u0141\u0106 G\u0118\u015aL\u0104 JA\u0179\u0143", content);
    }

    /**
     * Table cells must be separated by whitespace: after collapsing
     * whitespace runs to single spaces the cell values appear space-separated.
     */
    @Test
    public void testTableCellSeparation() throws Exception {
        File file = getResourceAsFile("/test-documents/testRTFTableCellSeparation.rtf");
        String content = tika.parseToString(file);
        content = content.replaceAll("\\s+", " ");
        // the original repeated this identical assertion twice; once is enough
        assertContains("a b c d \u00E4 \u00EB \u00F6 \u00FC", content);
    }

    /**
     * After collapsing whitespace, the two cell values "Station" and "Fax"
     * must appear separated by a single space.
     */
    @Test
    public void testTableCellSeparation2() throws Exception {
        String content = getText("testRTFTableCellSeparation2.rtf");
        // TODO: why do we insert extra whitespace...?
        content = content.replaceAll("\\s+", " ");
        assertContains("Station Fax", content);
    }

    /**
     * Checks two Czech phrases in the text extracted from the WordPad sample.
     */
    @Test
    public void testWordPadCzechCharactersExtraction() throws Exception {
        File file = getResourceAsFile("/test-documents/testRTFWordPadCzechCharacters.rtf");
        String s1 = tika.parseToString(file);
        assertContains("\u010Cl\u00E1nek t\u00FDdne", s1);
        assertContains("starov\u011Bk\u00E9 \u017Eidovsk\u00E9 n\u00E1bo\u017Eensk\u00E9 texty", s1);
    }

    /**
     * Checks two Czech phrases in the text extracted from the Word 2010 sample.
     */
    @Test
    public void testWord2010CzechCharactersExtraction() throws Exception {
        File file = getResourceAsFile("/test-documents/testRTFWord2010CzechCharacters.rtf");
        String s1 = tika.parseToString(file);
        assertContains("\u010Cl\u00E1nek t\u00FDdne", s1);
        assertContains("starov\u011Bk\u00E9 \u017Eidovsk\u00E9 n\u00E1bo\u017Eensk\u00E9 texty", s1);
    }

    /**
     * Checks both the body text and the title metadata of testRTF-ms932.rtf.
     */
    @Test
    public void testMS932Extraction() throws Exception {
        File file = getResourceAsFile("/test-documents/testRTF-ms932.rtf");
        String s1 = tika.parseToString(file);
        // Hello in Japanese
        assertTrue(s1.contains("\u3053\u3093\u306b\u3061\u306f"));
        // Verify title, since it was also encoded with MS932:
        Result r = getResult("testRTF-ms932.rtf");
        assertEquals("\u30bf\u30a4\u30c8\u30eb", r.metadata.get(TikaCoreProperties.TITLE));
    }

    /**
     * The text extracted from testRTFUmlautSpaces.rtf must contain the
     * expected word starting with a capital U-umlaut.
     */
    @Test
    public void testUmlautSpacesExtraction() throws Exception {
        File file = getResourceAsFile("/test-documents/testRTFUmlautSpaces.rtf");
        String s1 = tika.parseToString(file);
        assertContains("\u00DCbersicht", s1);
    }

    /**
     * Checks that a run of characters written as surrogate pairs is present
     * in the text extracted from testRTFUnicodeGothic.rtf.
     */
    @Test
    public void testGothic() throws Exception {
        String content = getText("testRTFUnicodeGothic.rtf");
        assertContains("\uD800\uDF32\uD800\uDF3f\uD800\uDF44\uD800\uDF39\uD800\uDF43\uD800\uDF3A", content);
    }

    /**
     * Checks title, creator, author and comments metadata as well as parts of
     * the body text of the Japanese RTF sample.
     */
    // NOTE(review): the method name looks garbled (presumably testJapaneseText);
    // it is left unchanged here so the block keeps its existing identifier.
    @Test
    public void testreplacedaneseText() throws Exception {
        Result r = getResult("testRTFJapanese.rtf");
        String content = r.text;
        // Verify title -- this title uses upr escape inside
        // title info field:
        assertEquals("\u30be\u30eb\u30b2\u3068\u5c3e\u5d0e\u3001\u6de1\u3005\u3068\u6700\u671f\u3000", r.metadata.get(TikaCoreProperties.TITLE));
        assertEquals("VMazel", r.metadata.get(TikaCoreProperties.CREATOR));
        assertEquals("VMazel", r.metadata.get(Metadata.AUTHOR));
        assertEquals("StarWriter", r.metadata.get(TikaCoreProperties.COMMENTS));
        // Special version of (GHQ)
        assertContains("\uff08\uff27\uff28\uff31\uff09", content);
        // 6 other characters
        assertContains("\u6771\u4eac\u90fd\u4e09\u9df9\u5e02", content);
    }

    /**
     * Checks the string length limits of {@code Tika.parseToString}: the
     * default limit, a limit set on the instance and a limit passed per call.
     */
    @Test
    public void testMaxLength() throws Exception {
        File file = getResourceAsFile("/test-documents/testRTFJapanese.rtf");
        Metadata metadata = new Metadata();
        // The streams below are not closed explicitly: parseToString closes
        // the stream it is given, for convenience.
        InputStream stream = TikaInputStream.get(file, metadata);
        // Test w/ default limit:
        Tika localTika = new Tika();
        String content = localTika.parseToString(stream, metadata);
        assertTrue(content.length() > 500);
        // Test setting max length on the instance:
        localTika.setMaxStringLength(200);
        stream = TikaInputStream.get(file, metadata);
        content = localTika.parseToString(stream, metadata);
        assertTrue(content.length() <= 200);
        // Test setting max length per-call:
        stream = TikaInputStream.get(file, metadata);
        content = localTika.parseToString(stream, metadata, 100);
        assertTrue(content.length() <= 100);
    }

    /**
     * Literal curly braces in the document text must be kept in the output.
     */
    @Test
    public void testTextWithCurlyBraces() throws Exception {
        String content = getText("testRTFWithCurlyBraces.rtf");
        assertContains("{ some text inside curly brackets }", content);
    }

    /**
     * Checks the characters produced for dashes, the non-breaking hyphen and
     * space, the optional hyphen and typographic quotes in testRTFControls.rtf.
     */
    @Test
    public void testControls() throws Exception {
        Result r = getResult("testRTFControls.rtf");
        String content = r.text;
        assertContains("Thiswordhasanem\u2014dash", content);
        assertContains("Thiswordhasanen\u2013dash", content);
        assertContains("Thiswordhasanon\u2011breakinghyphen", content);
        assertContains("Thiswordhasanonbreaking\u00a0space", content);
        assertContains("Thiswordhasanoptional\u00adhyphen", content);
        assertContains("\u2018Single quoted text\u2019", content);
        assertContains("\u201cDouble quoted text\u201d", content);
        assertContains("\u201cDouble quoted text again\u201d", content);
    }

    /**
     * Unpaired or mismatched surrogates must show up as the Unicode
     * replacement character in the extracted text.
     */
    @Test
    public void testInvalidUnicode() throws Exception {
        Result r = getResult("testRTFInvalidUnicode.rtf");
        String content = r.text;
        assertContains("Unpaired hi \ufffd here", content);
        assertContains("Unpaired lo \ufffd here", content);
        assertContains("Mismatched pair \ufffd\ufffd here", content);
    }

    /**
     * Broad check of testRTFVarious.rtf: footnotes, header/footer, text box,
     * styled words, tables, lists, keyword and subject metadata, and the
     * Japanese and Gothic text passages.
     */
    @Test
    public void testVarious() throws Exception {
        Result r = getResult("testRTFVarious.rtf");
        String content = r.text;
        assertContains("Footnote appears here", content);
        assertContains("This is a footnote.", content);
        assertContains("This is the header text.", content);
        assertContains("This is the footer text.", content);
        assertContains("Here is a text box", content);
        assertContains("Bold", content);
        assertContains("italic", content);
        assertContains("underline", content);
        assertContains("superscript", content);
        assertContains("subscript", content);
        assertContains("Here is a citation:", content);
        assertContains("Figure 1 This is a caption for Figure 1", content);
        assertContains("(Kramer)", content);
        // Table
        assertContains("Row 1 Col 1 Row 1 Col 2 Row 1 Col 3 Row 2 Col 1 Row 2 Col 2 Row 2 Col 3", content.replaceAll("\\s+", " "));
        // 2-columns
        assertContains("Row 1 column 1 Row 2 column 1 Row 1 column 2 Row 2 column 2", content.replaceAll("\\s+", " "));
        assertContains("This is a hyperlink", content);
        assertContains("Here is a list:", content);
        for (int row = 1; row <= 3; row++) {
            assertContains("Bullet " + row, content);
        }
        assertContains("Here is a numbered list:", content);
        for (int row = 1; row <= 3; row++) {
            assertContains("Number bullet " + row, content);
        }
        for (int row = 1; row <= 2; row++) {
            for (int col = 1; col <= 3; col++) {
                assertContains("Row " + row + " Col " + col, content);
            }
        }
        assertContains("Keyword1 Keyword2", content);
        assertEquals("Keyword1 Keyword2", r.metadata.get(TikaCoreProperties.KEYWORDS));
        assertContains("Subject is here", content);
        assertEquals("Subject is here", r.metadata.get(OfficeOpenXMLCore.SUBJECT));
        assertEquals("Subject is here", r.metadata.get(Metadata.SUBJECT));
        assertContains("Suddenly some Japanese text:", content);
        // Special version of (GHQ)
        assertContains("\uff08\uff27\uff28\uff31\uff09", content);
        // 6 other characters
        assertContains("\u30be\u30eb\u30b2\u3068\u5c3e\u5d0e\u3001\u6de1\u3005\u3068\u6700\u671f", content);
        assertContains("And then some Gothic text:", content);
        assertContains("\uD800\uDF32\uD800\uDF3f\uD800\uDF44\uD800\uDF39\uD800\uDF43\uD800\uDF3A", content);
    }

    /**
     * Bold and italic words of testRTFVarious.rtf must be wrapped in the
     * matching markup in the XML output.
     */
    @Test
    public void testVariousStyle() throws Exception {
        String content = getXML("testRTFVarious.rtf").xml;
        assertContains("<b>Bold</b>", content);
        assertContains("<i>italic</i>", content);
    }

    /**
     * Checks the nesting of bold and italic markup for the combinations found
     * in testRTFBoldItalic.rtf.
     */
    @Test
    public void testBoldItalic() throws Exception {
        String content = getXML("testRTFBoldItalic.rtf").xml;
        assertContains("<b>bold</b>", content);
        assertContains("<b>bold </b><b><i>italic</i></b>", content);
        assertContains("<b><i>italic </i></b><b>bold</b>", content);
        assertContains("<i>italic</i>", content);
        assertContains("<b>bold then </b><b><i>italic then</i></b><i> not bold</i>", content);
        assertContains("<i>italic then </i><b><i>bold then</i></b><b> not italic</b>", content);
    }

    /**
     * The hyperlink must be emitted as an anchor element, and the output must
     * not contain a paragraph consisting only of two tabs.
     */
    @Test
    public void testHyperlink() throws Exception {
        String content = getXML("testRTFHyperlink.rtf").xml;
        assertContains("our most <a href=\"http://r.office.microsoft.com/r/rlidwelcomeFAQ?clid=1033\">frequently asked questions</a>", content);
        assertEquals(-1, content.indexOf("<p>\t\t</p>"));
    }

    /**
     * The sample sentence must come out as one uninterrupted paragraph in the
     * XML output of testRTFIgnoredControlWord.rtf.
     */
    @Test
    public void testIgnoredControlWord() throws Exception {
        assertContains("<p>The quick brown fox jumps over the lazy dog</p>", getXML("testRTFIgnoredControlWord.rtf").xml);
    }

    /**
     * Checks that the expected Cyrillic phrase appears in the XML output of
     * testFontAfterBufferedText.rtf.
     */
    @Test
    public void testFontAfterBufferedText() throws Exception {
        assertContains("\u0423\u0432\u0430\u0436\u0430\u0435\u043c\u044b\u0439 \u043a\u043b\u0438\u0435\u043d\u0442!", getXML("testFontAfterBufferedText.rtf").xml);
    }

    /**
     * Ordered and unordered lists of the Microsoft Word sample must be
     * rendered as list markup in the XML output.
     */
    @Test
    public void testListMicrosoftWord() throws Exception {
        String content = getXML("testRTFListMicrosoftWord.rtf").xml;
        assertContains("<ol>\t<li>one</li>", content);
        assertContains("</ol>", content);
        assertContains("<ul>\t<li>first</li>", content);
        assertContains("</ul>", content);
    }

    /**
     * Ordered and unordered lists of the LibreOffice sample must be rendered
     * as list markup in the XML output.
     */
    @Test
    public void testListLibreOffice() throws Exception {
        String content = getXML("testRTFListLibreOffice.rtf").xml;
        assertContains("<ol>\t<li>one</li>", content);
        assertContains("</ol>", content);
        assertContains("<ul>\t<li>first</li>", content);
        assertContains("</ul>", content);
    }

    // TIKA-782
    /**
     * Extracts the embedded object of testBinControlWord.rtf and checks that
     * exactly one object of ten bytes with the expected byte values is found.
     */
    @Test
    public void testBinControlWord() throws Exception {
        ByteCopyingHandler embHandler = new ByteCopyingHandler();
        try (TikaInputStream tis = TikaInputStream.get(getResourceAsStream("/test-documents/testBinControlWord.rtf"))) {
            ContainerExtractor ex = new ParserContainerExtractor();
            assertEquals(true, ex.isSupported(tis));
            ex.extract(tis, ex, embHandler);
        }
        assertEquals(1, embHandler.bytes.size());
        byte[] bytes = embHandler.bytes.get(0);
        assertEquals(10, bytes.length);
        // 125 is the ASCII code of '}'
        assertEquals(125, (int) bytes[4]);
        // make sure that at least the last value is correct
        assertEquals(-1, (int) bytes[9]);
    }

    // TIKA-999
    /**
     * Checks page, word and character counts and the creation date prefix in
     * the metadata of test_embedded_package.rtf.
     */
    @Test
    public void testMetaDataCounts() throws Exception {
        XMLResult xml = getXML("test_embedded_package.rtf");
        assertEquals("1", xml.metadata.get(Office.PAGE_COUNT));
        assertEquals("7", xml.metadata.get(Office.WORD_COUNT));
        assertEquals("36", xml.metadata.get(Office.CHARACTER_COUNT));
        assertTrue(xml.metadata.get(Office.CREATION_DATE).startsWith("2012-09-02T"));
    }

    // TIKA-1192
    /**
     * The body text of testRTFListOverride.rtf must still be extracted.
     */
    @Test
    public void testListOverride() throws Exception {
        Result r = getResult("testRTFListOverride.rtf");
        String content = r.text;
        assertContains("Body", content);
    }

    // TIKA-1305
    /**
     * Text must still be extracted from testRTFCorruptListOverride.rtf.
     */
    @Test
    public void testCorruptListOverride() throws Exception {
        Result r = getResult("testRTFCorruptListOverride.rtf");
        String content = r.text;
        assertContains("apple", content);
    }

    // TIKA-1010
    /**
     * Extracts the embedded files of testRTFEmbeddedFiles.rtf twice: first
     * with EMF/WMF types skipped, comparing every file name and media type
     * against the expected lists in order; then without skipping, checking
     * the total count and the names of the last two entries.
     */
    @Test
    public void testEmbeddedMonster() throws Exception {
        Set<MediaType> skipTypes = new HashSet<MediaType>();
        skipTypes.add(MediaType.parse("application/x-emf"));
        skipTypes.add(MediaType.parse("application/x-msmetafile"));
        // expected file names, in extraction order
        List<String> trueNames = new ArrayList<String>();
        trueNames.add("file_0.doc");
        trueNames.add("Hw.txt");
        trueNames.add("file_1.xlsx");
        trueNames.add("test-zip-of-zip_\u666E\u6797\u65AF\u987F.zip");
        trueNames.add("html-within-zip.zip");
        trueNames.add("text.html");
        trueNames.add("testHTML_utf8_\u666E\u6797\u65AF\u987F.html");
        trueNames.add("testJPEG_\u666E\u6797\u65AF\u987F.jpg");
        trueNames.add("file_2.xls");
        trueNames.add("testMSG_\u666E\u6797\u65AF\u987F.msg");
        trueNames.add("file_3.pdf");
        trueNames.add("file_4.ppt");
        trueNames.add("file_5.pptx");
        trueNames.add("thumbnail.jpeg");
        trueNames.add("file_6.doc");
        trueNames.add("file_7.doc");
        trueNames.add("file_8.docx");
        trueNames.add("testJPEG_\u666E\u6797\u65AF\u987F.jpg");
        // expected media types, index-aligned with trueNames
        List<String> trueTypes = new ArrayList<String>();
        trueTypes.add("application/msword");
        trueTypes.add("text/plain");
        trueTypes.add("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet");
        trueTypes.add("application/zip");
        trueTypes.add("application/zip");
        trueTypes.add("text/html");
        trueTypes.add("text/html");
        trueTypes.add("image/jpeg");
        trueTypes.add("application/vnd.ms-excel");
        trueTypes.add("application/vnd.ms-outlook");
        trueTypes.add("application/pdf");
        trueTypes.add("application/vnd.ms-powerpoint");
        trueTypes.add("application/vnd.openxmlformats-officedocument.presentationml.presentation");
        trueTypes.add("image/jpeg");
        trueTypes.add("application/msword");
        trueTypes.add("application/msword");
        trueTypes.add("application/vnd.openxmlformats-officedocument.wordprocessingml.document");
        trueTypes.add("image/jpeg");
        TrackingHandler tracker = new TrackingHandler(skipTypes);
        try (TikaInputStream tis = TikaInputStream.get(getResourceAsStream("/test-documents/testRTFEmbeddedFiles.rtf"))) {
            ContainerExtractor ex = new ParserContainerExtractor();
            assertEquals(true, ex.isSupported(tis));
            ex.extract(tis, ex, tracker);
        }
        assertEquals(trueNames.size(), tracker.filenames.size());
        assertEquals(trueTypes.size(), tracker.mediaTypes.size());
        for (int i = 0; i < tracker.filenames.size(); i++) {
            String expectedName = trueNames.get(i);
            if (expectedName == null) {
                assertNull(tracker.filenames.get(i));
            } else {
                assertNotNull(tracker.filenames.get(i));
                // necessary to getName() because MSOffice extractor includes
                // directory: _1457338524/HW.txt
                assertEquals("filename equals ", expectedName, FilenameUtils.getName(tracker.filenames.get(i)));
            }
            assertEquals(trueTypes.get(i), tracker.mediaTypes.get(i).toString());
        }
        // second pass: nothing skipped, so the EMF/WMF thumbnails are reported too
        tracker = new TrackingHandler();
        try (TikaInputStream tis = TikaInputStream.get(getResourceAsStream("/test-documents/testRTFEmbeddedFiles.rtf"))) {
            ContainerExtractor ex = new ParserContainerExtractor();
            assertEquals(true, ex.isSupported(tis));
            ex.extract(tis, ex, tracker);
        }
        assertEquals(47, tracker.filenames.size());
        assertEquals("thumbnail_26.emf", tracker.filenames.get(45));
        assertEquals("thumbnail_27.wmf", tracker.filenames.get(46));
    }

    /**
     * TIKA-1010: regular (not "embedded") images/picts in an RTF file.
     * <p>
     * Parses {@code testRTFRegularImages.rtf} through a {@link RecursiveParserWrapper} and checks
     * that the two JPEG entries carry their own metadata and are not flagged as thumbnails.
     */
    @Test
    public void testRegularImages() throws Exception {
        Parser base = new AutoDetectParser();
        ParseContext ctx = new ParseContext();
        RecursiveParserWrapper parser = new RecursiveParserWrapper(base, new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.IGNORE, -1));
        // register the wrapper itself so embedded resources are routed back through it
        ctx.set(org.apache.tika.parser.Parser.class, parser);
        ContentHandler handler = new BodyContentHandler();
        Metadata rootMetadata = new Metadata();
        rootMetadata.add(Metadata.RESOURCE_NAME_KEY, "testRTFRegularImages.rtf");
        try (TikaInputStream tis = TikaInputStream.get(getResourceAsStream("/test-documents/testRTFRegularImages.rtf"))) {
            parser.parse(tis, handler, rootMetadata, ctx);
        }
        List<Metadata> metadatas = parser.getMetadata();
        // index 1 is expected to be testJPEG_EXIF_\u666E\u6797\u65AF\u987F.jpg
        Metadata metaJpgExif = metadatas.get(1);
        // index 3 is expected to be testJPEG_\u666E\u6797\u65AF\u987F.jpg
        Metadata metaJpg = metadatas.get(3);
        assertNotNull(metaJpgExif);
        assertNotNull(metaJpg);
        assertTrue(Arrays.asList(metaJpgExif.getValues("dc:subject")).contains("serbor"));
        assertTrue(metaJpg.get("Comments").contains("Licensed to the Apache"));
        // make sure old metadata doesn't linger between objects
        assertFalse(Arrays.asList(metaJpg.getValues("dc:subject")).contains("serbor"));
        assertEquals("false", metaJpg.get(RTFMetadata.THUMBNAIL));
        assertEquals("false", metaJpgExif.get(RTFMetadata.THUMBNAIL));
        assertEquals(46, metaJpg.names().length);
        assertEquals(110, metaJpgExif.names().length);
    }

    /**
     * Checks the paragraph structure produced for {@code testRTFNewlines.rtf}: runs of line
     * breaks in the XML output are collapsed to a single space before comparing, so the
     * expectation only depends on the sequence of {@code <p>} elements.
     */
    @Test
    public void testMultipleNewlines() throws Exception {
        String content = getXML("testRTFNewlines.rtf").xml;
        content = content.replaceAll("[\r\n]+", " ");
        assertContains("<body><p>one</p> "
                + "<p /> "
                + "<p>two</p> "
                + "<p /> " + "<p /> "
                + "<p>three</p> "
                + "<p /> " + "<p /> " + "<p /> "
                + "<p>four</p>", content);
    }

    /**
     * TIKA-1010: an RTF file containing a linked (rather than embedded) document.
     * <p>
     * The file is extracted twice: once with EMF/WMF media types skipped by the tracker and once
     * with no filter. Neither pass may throw.
     */
    @Test
    public void testEmbeddedLinkedDoreplacedent() throws Exception {
        Set<MediaType> skipTypes = new HashSet<>();
        skipTypes.add(MediaType.parse("application/x-emf"));
        skipTypes.add(MediaType.parse("application/x-msmetafile"));
        TrackingHandler tracker = new TrackingHandler(skipTypes);
        try (TikaInputStream tis = TikaInputStream.get(getResourceAsStream("/test-documents/testRTFEmbeddedLink.rtf"))) {
            ContainerExtractor ex = new ParserContainerExtractor();
            assertTrue(ex.isSupported(tis));
            ex.extract(tis, ex, tracker);
        }
        // should gracefully skip link and not throw NPE, IOEx, etc
        assertEquals(0, tracker.filenames.size());
        tracker = new TrackingHandler();
        try (TikaInputStream tis = TikaInputStream.get(getResourceAsStream("/test-documents/testRTFEmbeddedLink.rtf"))) {
            ContainerExtractor ex = new ParserContainerExtractor();
            assertTrue(ex.isSupported(tis));
            ex.extract(tis, ex, tracker);
        }
        // without the skip filter two entries are reported (presumably the EMF/WMF renditions
        // filtered out above -- TODO confirm); the link itself must still be skipped silently
        assertEquals(2, tracker.filenames.size());
    }

    /**
     * Parses a file from the test-documents resource folder with the shared {@code tika} parser.
     *
     * @param filename name of the file below {@code /test-documents/}
     * @return the text written by the content handler together with the populated metadata
     * @throws Exception if the resource cannot be opened or parsing fails
     */
    private Result getResult(String filename) throws Exception {
        File file = getResourceAsFile("/test-documents/" + filename);
        Metadata metadata = new Metadata();
        StringWriter writer = new StringWriter();
        // close the file stream even when parsing throws; it was previously leaked
        try (FileInputStream stream = new FileInputStream(file)) {
            tika.getParser().parse(stream, new WriteOutContentHandler(writer), metadata, new ParseContext());
        }
        return new Result(writer.toString(), metadata);
    }

    /**
     * Convenience wrapper around {@code getResult} that returns only the extracted text.
     *
     * @param filename name of the file handed to {@code getResult}
     * @return the {@code text} field of the parse result
     * @throws Exception whatever {@code getResult} throws
     */
    private String getText(String filename) throws Exception {
        Result parsed = getResult(filename);
        return parsed.text;
    }

    private static clreplaced Result {

        public final String text;

        public final Metadata metadata;

        public Result(String text, Metadata metadata) {
            this.text = text;
            this.metadata = metadata;
        }
    }
}

19 View Complete Implementation : TikaParserTest.java
Copyright BSD 2-Clause "Simplified" License
Author : rohanpadhye
/**
 * Fuzz target: hands the generated stream to a default {@link Tika} facade and reads the
 * returned reader until EOF. The characters read are discarded.
 *
 * @param in generated input stream
 * @throws IOException if reading from the parse reader fails
 */
@Fuzz
public void fuzz(@From(InputStreamGenerator.class) InputStream in) throws IOException {
    Tika tika = new Tika();
    try (Reader reader = tika.parse(in)) {
        char[] buf = new char[1024];
        // Keep reading until EOF
        while (reader.read(buf) != -1) {
            // nothing to do with the data; draining the reader is the point
        }
    }
}

19 View Complete Implementation : ImageUtil.java
Copyright GNU General Public License v3.0
Author : scriptkittie
/**
 * Detects the media type of the given bytes with a newly created {@link Tika} facade.
 *
 * @param inputByteArray raw content to inspect
 * @return the media type string reported by {@code Tika.detect(byte[])}
 * @throws Exception declared for callers; nothing is caught here
 */
public static String getImageType(byte[] inputByteArray) throws Exception {
    return new Tika().detect(inputByteArray);
}

19 View Complete Implementation : DirectoryFileService.java
Copyright Apache License 2.0
Author : RyanSusana
public clreplaced DirectoryFileService implements FileService {

    private final String rootFolderLocation;

    private final Tika tika = new Tika();

    public DirectoryFileService(String rootFolderLocation) {
        this.rootFolderLocation = rootFolderLocation;
        ensureRootFolderExists();
    }

    private static String replaceNMatches(String input, String regex, String replacement, int numberOfTimes) {
        final var quoteReplacement = Matcher.quoteReplacement(replacement);
        Matcher m = Pattern.compile(regex).matcher(input);
        StringBuilder sb = new StringBuilder();
        int i = 0;
        while (i++ < numberOfTimes && m.find()) {
            m.appendReplacement(sb, quoteReplacement);
        }
        m.appendTail(sb);
        return sb.toString();
    }

    void ensureRootFolderExists() {
        Path path = Paths.get(rootFolderLocation);
        try {
            Files.createDirectories(path);
        } catch (IOException e) {
            throw new ElepyConfigException("Can't create upload folder");
        }
    }

    @Override
    public synchronized void uploadFile(FileUpload file) {
        final Path path = Paths.get(rootFolderLocation + File.separator + decodeFileName(file.getName()));
        try {
            Files.createDirectories(path.getParent() == null ? path : path.getParent());
            Files.copy(file.getContent(), path);
        } catch (FileAlreadyExistsException e) {
            throw new ElepyException("FileReference Already Exists: " + file.getName(), 409);
        } catch (IOException e) {
            throw new ElepyException("Failed to upload file: " + file.getName(), 500, e);
        }
    }

    @Override
    public synchronized Optional<FileUpload> readFile(String name) {
        final Path path = Paths.get(rootFolderLocation + File.separator + decodeFileName(name));
        try {
            final FileUpload fileUpload = FileUpload.of(name, tika.detect(path), Files.newInputStream(path), Files.size(path));
            return Optional.of(fileUpload);
        } catch (NoSuchFileException e) {
            return Optional.empty();
        } catch (IOException e) {
            throw new ElepyException("Failed at retrieving file: " + name, 500);
        }
    }

    @Override
    public List<String> listFiles() {
        final Path path = Paths.get(rootFolderLocation);
        try (Stream<Path> walk = Files.walk(path)) {
            return walk.filter(path1 -> !Files.isDirectory(path1)).map(Path::toString).map(filePath -> filePath.substring(path.toString().length() + 1)).map(this::encodeFileName).collect(Collectors.toList());
        } catch (IOException e) {
            throw new ElepyException("Failed to list all files on Server", 500, e);
        }
    }

    @Override
    public void deleteFile(String encodedFileName) {
        final Path path = Paths.get(rootFolderLocation + File.separator + decodeFileName(encodedFileName));
        try {
            Files.delete(path);
        } catch (IOException e) {
            throw new ElepyException("Failed to delete file: " + encodedFileName, 500);
        }
    }

    private String decodeFileName(String encodedFileName) {
        return replaceNMatches(encodedFileName, "_", File.separator, 2);
    }

    private String encodeFileName(String decodedFileName) {
        return decodedFileName.replaceAll(Matcher.quoteReplacement(File.separator), "_");
    }
}

19 View Complete Implementation : TikaFileTypeDetector.java
Copyright GNU General Public License v2.0
Author : SOBotics
public clreplaced TikaFileTypeDetector extends FileTypeDetector {

    private final Tika tika = new Tika();

    public TikaFileTypeDetector() {
        super();
    }

    @Override
    public String probeContentType(Path path) throws IOException {
        // Try to detect based on the file name only for efficiency
        String fileNameDetect = tika.detect(path.toString());
        if (!fileNameDetect.equals(MimeTypes.OCTET_STREAM)) {
            return fileNameDetect;
        }
        // Then check the file content if necessary
        String fileContentDetect = tika.detect(path);
        if (!fileContentDetect.equals(MimeTypes.OCTET_STREAM)) {
            return fileContentDetect;
        }
        // Specification says to return null if we could not
        // conclusively determine the file type
        return null;
    }
}

19 View Complete Implementation : ProbabilisticMimeDetectionTestWithTika.java
Copyright GNU General Public License v2.0
Author : SOBotics
public clreplaced ProbabilisticMimeDetectionTestWithTika {

    private ProbabilisticMimeDetectionSelector proSelector;

    private MediaTypeRegistry registry;

    private Tika tika;

    /**
     * @inheritDoc
     */
    @Before
    public void setUp() {
        MimeTypes types = MimeTypes.getDefaultMimeTypes();
        ServiceLoader loader = new ServiceLoader();
        registry = types.getMediaTypeRegistry();
        /*
         * here is an example with the use of the builder to
         * instantiate the object.
         */
        Builder builder = new ProbabilisticMimeDetectionSelector.Builder();
        proSelector = new ProbabilisticMimeDetectionSelector(types, builder.priorMagicFileType(0.5f).priorExtensionFileType(0.5f).priorMetaFileType(0.5f));
        DefaultProbDetector detector = new DefaultProbDetector(proSelector, loader);
        // Use a default Tika, except for our different detector
        tika = new Tika(detector);
    }

    @Test
    public void testDetection() throws Exception {
        testFile("image/svg+xml", "circles.svg");
        testFile("image/svg+xml", "circles-with-prefix.svg");
        testFile("image/png", "datamatrix.png");
        testFile("text/html", "test.html");
        testFile("application/xml", "test-iso-8859-1.xml");
        testFile("application/xml", "test-utf8.xml");
        testFile("application/xml", "test-utf8-bom.xml");
        testFile("application/xml", "test-utf16le.xml");
        testFile("application/xml", "test-utf16be.xml");
        testFile("application/xml", "test-long-comment.xml");
        testFile("application/xslt+xml", "stylesheet.xsl");
        testUrl("application/rdf+xml", "http://www.ai.sri.com/daml/services/owl-s/1.2/Process.owl", "test-difficult-rdf1.xml");
        testUrl("application/rdf+xml", "http://www.w3.org/2002/07/owl#", "test-difficult-rdf2.xml");
        // add evil test from TIKA-327
        testFile("text/html", "test-tika-327.html");
        // add another evil html test from TIKA-357
        testFile("text/html", "testlargerbuffer.html");
        // test fragment of HTML with <div> (TIKA-1102)
        testFile("text/html", "htmlfragment");
        // test binary CGM detection (TIKA-1170)
        testFile("image/cgm", "plotutils-bin-cgm-v3.cgm");
        // test HTML detection of malformed file, previously identified as
        // image/cgm (TIKA-1170)
        testFile("text/html", "test-malformed-header.html.bin");
    }

    @Test
    public void testByteOrderMark() throws Exception {
        replacedertEquals(MediaType.TEXT_PLAIN.toString(), tika.detect(new ByteArrayInputStream("\ufefftest".getBytes(UTF_16LE)), new Metadata()));
        replacedertEquals(MediaType.TEXT_PLAIN.toString(), tika.detect(new ByteArrayInputStream("\ufefftest".getBytes(UTF_16BE)), new Metadata()));
        replacedertEquals(MediaType.TEXT_PLAIN.toString(), tika.detect(new ByteArrayInputStream("\ufefftest".getBytes(UTF_8)), new Metadata()));
    }

    @Test
    public void testSuperTypes() {
        replacedertTrue(registry.isSpecializationOf(MediaType.parse("text/something; charset=UTF-8"), MediaType.parse("text/something")));
        replacedertTrue(registry.isSpecializationOf(MediaType.parse("text/something; charset=UTF-8"), MediaType.TEXT_PLAIN));
        replacedertTrue(registry.isSpecializationOf(MediaType.parse("text/something; charset=UTF-8"), MediaType.OCTET_STREAM));
        replacedertTrue(registry.isSpecializationOf(MediaType.parse("text/something"), MediaType.TEXT_PLAIN));
        replacedertTrue(registry.isSpecializationOf(MediaType.parse("application/something+xml"), MediaType.APPLICATION_XML));
        replacedertTrue(registry.isSpecializationOf(MediaType.parse("application/something+zip"), MediaType.APPLICATION_ZIP));
        replacedertTrue(registry.isSpecializationOf(MediaType.APPLICATION_XML, MediaType.TEXT_PLAIN));
        replacedertTrue(registry.isSpecializationOf(MediaType.parse("application/vnd.apple.iwork"), MediaType.APPLICATION_ZIP));
    }

    @SuppressWarnings("unused")
    private void testUrlOnly(String expected, String url) throws IOException {
        InputStream in = new URL(url).openStream();
        testStream(expected, url, in);
    }

    private void testUrl(String expected, String url, String file) throws IOException {
        InputStream in = getClreplaced().getResourcereplacedtream(file);
        testStream(expected, url, in);
    }

    private void testFile(String expected, String filename) throws IOException {
        InputStream in = getClreplaced().getResourcereplacedtream(filename);
        testStream(expected, filename, in);
    }

    private void testStream(String expected, String urlOrFileName, InputStream in) throws IOException {
        replacedertNotNull("Test stream: [" + urlOrFileName + "] is null!", in);
        if (!in.markSupported()) {
            in = new java.io.BufferedInputStream(in);
        }
        try {
            Metadata metadata = new Metadata();
            // String mime = this.proDetector.detect(in, metadata).toString();
            String mime = tika.detect(in, metadata).toString();
            replacedertEquals(urlOrFileName + " is not properly detected: detected.", expected, mime);
            // Add resource name and test again
            metadata.set(Metadata.RESOURCE_NAME_KEY, urlOrFileName);
            // mime = this.proDetector.detect(in, metadata).toString();
            mime = tika.detect(in, metadata).toString();
            replacedertEquals(urlOrFileName + " is not properly detected after adding resource name.", expected, mime);
        } finally {
            in.close();
        }
    }

    /**
     * Test for type detection of empty doreplacedents.
     *
     * @see <a
     *      href="https://issues.apache.org/jira/browse/TIKA-483">TIKA-483</a>
     */
    @Test
    public void testEmptyDoreplacedent() throws IOException {
        replacedertEquals(MediaType.OCTET_STREAM.toString(), tika.detect(new ByteArrayInputStream(new byte[0]), new Metadata()));
        Metadata namehint = new Metadata();
        namehint.set(Metadata.RESOURCE_NAME_KEY, "test.txt");
        replacedertEquals(MediaType.TEXT_PLAIN.toString(), tika.detect(new ByteArrayInputStream(new byte[0]), namehint));
        Metadata typehint = new Metadata();
        typehint.set(Metadata.CONTENT_TYPE, "text/plain");
        replacedertEquals(MediaType.TEXT_PLAIN.toString(), tika.detect(new ByteArrayInputStream(new byte[0]), typehint));
    }

    /**
     * Test for things like javascript files whose content is enclosed in XML
     * comment delimiters, but that aren't actually XML.
     *
     * @see <a
     *      href="https://issues.apache.org/jira/browse/TIKA-426">TIKA-426</a>
     */
    @Test
    public void testNotXML() throws IOException {
        replacedertEquals(MediaType.TEXT_PLAIN.toString(), tika.detect(new ByteArrayInputStream("<!-- test -->".getBytes(UTF_8)), new Metadata()));
    }

    /**
     * Tests that when we repeatedly test the detection of a doreplacedent that can
     * be detected with Mime Magic, that we consistently detect it correctly.
     * See TIKA-391 for more details.
     */
    @Test
    public void testMimeMagicStability() throws IOException {
        for (int i = 0; i < 100; i++) {
            testFile("application/vnd.ms-excel", "test.xls");
        }
    }

    /**
     * Tests that when two magic matches both apply, and both have the same
     * priority, we use the name to pick the right one based on the glob, or the
     * first one we come across if not. See TIKA-1292 for more details.
     */
    @Test
    public void testMimeMagicClashSamePriority() throws IOException {
        byte[] helloWorld = "Hello, World!".getBytes(UTF_8);
        MediaType helloType = MediaType.parse("hello/world-file");
        MediaType helloXType = MediaType.parse("hello/x-world-hello");
        Metadata metadata;
        // With a filename, picks the right one
        metadata = new Metadata();
        metadata.set(Metadata.RESOURCE_NAME_KEY, "test.hello.world");
        replacedertEquals(helloType.toString(), tika.detect(new ByteArrayInputStream(helloWorld), metadata));
        metadata = new Metadata();
        metadata.set(Metadata.RESOURCE_NAME_KEY, "test.x-hello-world");
        replacedertEquals(helloXType.toString(), tika.detect(new ByteArrayInputStream(helloWorld), metadata));
        // Without, goes for the one that sorts last
        metadata = new Metadata();
        metadata.set(Metadata.RESOURCE_NAME_KEY, "testingTESTINGtesting");
        replacedertEquals(helloXType.toString(), tika.detect(new ByteArrayInputStream(helloWorld), metadata));
    }
}

19 View Complete Implementation : Document.java
Copyright GNU General Public License v3.0
Author : jease
/**
 * Returns the plain text extracted from the underlying file, re-parsing it only when no text
 * has been cached yet or the file was modified after the last extraction.
 * <p>
 * Runs of blank lines are collapsed to a single empty line. If extraction fails the cached
 * text becomes the empty string; the update timestamp is refreshed in either case.
 */
public String getText() {
    boolean cacheIsCurrent = text != null && lastTextUpdate >= getFile().lastModified();
    if (cacheIsCurrent) {
        return text;
    }
    try {
        Tika extractor = new Tika();
        // -1 removes the limit on the length of the extracted string
        extractor.setMaxStringLength(-1);
        String raw = extractor.parseToString(getFile());
        text = raw.replaceAll("\n\\s*\n+", "\n\n");
    } catch (TikaException | IOException e) {
        text = "";
    } finally {
        lastTextUpdate = getFile().lastModified();
    }
    return text;
}

19 View Complete Implementation : InterruptableParsingExample.java
Copyright GNU General Public License v2.0
Author : SOBotics
/**
 * This example demonstrates how to interrupt doreplacedent parsing if
 * some condition is met.
 * <p>
 * {@link InterruptingContentHandler} throws special exception as soon as
 * find {@code query} string in parsed file.
 *
 * See also http://stackoverflow.com/questions/31939851
 */
public clreplaced InterruptableParsingExample {

    // for default autodetect parser
    private Tika tika = new Tika();

    public boolean findInFile(String query, Path path) {
        InterruptingContentHandler handler = new InterruptingContentHandler(query);
        Metadata metadata = new Metadata();
        ParseContext context = new ParseContext();
        context.set(Parser.clreplaced, tika.getParser());
        try (InputStream is = new BufferedInputStream(Files.newInputStream(path))) {
            tika.getParser().parse(is, handler, metadata, context);
        } catch (QueryMatchedException e) {
            return true;
        } catch (SAXException | TikaException | IOException e) {
            // something went wrong with parsing...
            e.printStackTrace();
        }
        return false;
    }

    clreplaced QueryMatchedException extends SAXException {
    }

    /**
     * Trivial content handler that searched for {@code query} in characters send to it.
     * <p>
     * Throws {@link QueryMatchedException} when query string is found.
     */
    clreplaced InterruptingContentHandler extends DefaultHandler {

        private String query;

        private StringBuilder sb = new StringBuilder();

        InterruptingContentHandler(String query) {
            this.query = query;
        }

        @Override
        public void characters(char[] ch, int start, int length) throws SAXException {
            sb.append(new String(ch, start, length).toLowerCase(Locale.getDefault()));
            if (sb.toString().contains(query))
                throw new QueryMatchedException();
            if (sb.length() > 2 * query.length())
                // keep tail with query.length() chars
                sb.delete(0, sb.length() - query.length());
        }
    }
}

19 View Complete Implementation : ClientRequest.java
Copyright Apache License 2.0
Author : Elopteryx
/**
 * Utility clreplaced for making multipart requests.
 */
public final clreplaced ClientRequest {

    static final String BOUNDARY = "--TNoK9riv6EjfMhxBzj22SKGnOaIhZlxhar";

    static final String SIMPLE = "simple";

    static final String THRESHOLD_LESSER = "threshold_lesser";

    static final String THRESHOLD_GREATER = "threshold_greater";

    static final String ERROR = "error";

    static final String IO_ERROR_UPON_ERROR = "io_error_upon_error";

    static final String SERVLET_ERROR_UPON_ERROR = "servlet_error_upon_error";

    static final String COMPLEX = "complex";

    static final FileSystem FILE_SYSTEM = Jimfs.newFileSystem();

    static final Tika TIKA = new Tika();

    /**
     * Creates and sends a randomized multipart request for the
     * given address.
     * @param url The target address
     * @param expectedStatus The expected HTTP response, can be null
     * @throws IOException If an IO error occurred
     */
    public static void performRequest(final String url, final Integer expectedStatus) throws IOException {
        performRequest(url, expectedStatus, withSeveralFields());
    }

    /**
     * Creates and sends a randomized multipart request for the
     * given address.
     * @param url The target address
     * @param expectedStatus The expected HTTP response, can be null
     * @param requestData The multipart body, can't be null
     * @throws IOException If an IO error occurred
     */
    public static void performRequest(final String url, final Integer expectedStatus, final ByteBuffer requestData) throws IOException {
        final var client = HttpClient.newBuilder().version(HTTP_1_1).build();
        final var request = HttpRequest.newBuilder().uri(URI.create(url)).timeout(Duration.ofSeconds(5)).header("Content-Type", "multipart/form-data; boundary=" + BOUNDARY).POST(HttpRequest.BodyPublishers.ofByteArray(requestData.array(), 0, requestData.limit())).build();
        try {
            client.send(request, responseInfo -> {
                final var statusCode = responseInfo.statusCode();
                System.out.println("----------------------------------------");
                System.out.println(statusCode);
                if (expectedStatus != null) {
                    replacedertEquals((int) expectedStatus, statusCode);
                }
                return HttpResponse.BodySubscribers.ofString(StandardCharsets.UTF_8);
            });
        } catch (final InterruptedException e) {
            throw new RuntimeException(e);
        }
    }
}

19 View Complete Implementation : TikaConfiguration.java
Copyright GNU Affero General Public License v3.0
Author : dzhw
/**
 * Bean factory method: wraps the supplied {@link Tika} instance in a new
 * {@code MimeTypeDetector}.
 */
@Bean
public MimeTypeDetector mimeTypeDetector(Tika tika) {
    MimeTypeDetector detector = new MimeTypeDetector(tika);
    return detector;
}

19 View Complete Implementation : MultipartFileUtils.java
Copyright Apache License 2.0
Author : reportportal
/**
 * @author <a href="mailto:[email protected]">Ihar Kahadouski</a>
 */
public clreplaced MultipartFileUtils {

    private static Tika tika = new Tika();

    private MultipartFileUtils() {
    // static only
    }

    public static CommonsMultipartFile getMultipartFile(String path) throws IOException {
        ClreplacedPathResource resource = new ClreplacedPathResource(path);
        InputStream bufferedInputStream = new BufferedInputStream(resource.getInputStream());
        FileItem fileItem = new DiskFileItem("mainFile", tika.detect(bufferedInputStream), false, resource.getFilename(), bufferedInputStream.available(), null);
        IOUtils.copy(bufferedInputStream, fileItem.getOutputStream());
        return new CommonsMultipartFile(fileItem);
    }
}

19 View Complete Implementation : MediaTypeValidator.java
Copyright Apache License 2.0
Author : ibissource
/**
 * Specific clreplaced to detect media type used by CisConversionServiceImpl
 */
clreplaced MediaTypeValidator {

    private Tika tika;

    private String pdfOutputlocation;

    /**
     * Package default access because it specific for the conversion.
     */
    public MediaTypeValidator(String pdfOutputlocation) {
        // Create only once. Tika seems to be thread safe
        // (see
        // http://stackoverflow.com/questions/10190980/spring-tika-integration-is-my-approach-thread-safe)
        tika = new Tika();
        this.pdfOutputlocation = pdfOutputlocation;
    }

    /**
     * Detects media type from input stream
     *
     * @param inputStream
     * @param filename
     * @return
     * @throws IOException
     */
    public MediaType getMediaType(InputStream inputStream, String filename) throws IOException {
        // Create every time as TemporaryResources is not thread-safe
        TemporaryResources tmp = new TemporaryResources();
        tmp.setTemporaryFileDirectory(Paths.get(pdfOutputlocation));
        try (TikaInputStream tis = TikaInputStream.get(inputStream, tmp)) {
            String type = tika.detect(tis, filename);
            return MediaType.parse(type);
        }
    }
}

19 View Complete Implementation : SimpleTikaBuilder.java
Copyright Apache License 2.0
Author : groupe-sii
/**
 * Instanreplacedes and configures the {@link TikaProvider}:
 * <ul>
 * <li>If a custom {@link Tika} instance has been provided, then use it
 * directly</li>
 * <li>If no custom {@link Tika} instance has been provided, then use the
 * default one: {@code new Tika()} (see
 * {@link TikaConfig#getDefaultConfig()})</li>
 * <li>Tika may be in some conditions not enough accurate. In this case, it will
 * return application/octet-stream mimetype. If
 * {@link #failIfOctetStream(boolean)} is set to true, then if Tika returns an
 * application/octet-stream, it will throw an exception. The purpose is to let
 * another {@link MimeTypeProvider} implementation take over and try to make a
 * better detection.</li>
 * </ul>
 *
 * @author Aurélien Baudet
 *
 * @param <P>
 *            the type of the parent builder (when calling {@link #and()}
 *            method)
 */
public clreplaced SimpleTikaBuilder<P> extends AbstractParent<P> implements TikaBuilder<P> {

    private Tika tika;

    private boolean failIfOctetStream = true;

    /**
     * The parent builder (it is used when calling {@link #and()} method).
     *
     * @param parent
     *            the parent builder
     */
    public SimpleTikaBuilder(P parent) {
        super(parent);
    }

    @Override
    public TikaBuilder<P> instance(Tika tika) {
        this.tika = tika;
        return this;
    }

    @Override
    public TikaBuilder<P> failIfOctetStream(boolean fail) {
        failIfOctetStream = fail;
        return this;
    }

    @Override
    public MimeTypeProvider build() {
        Tika tikaInstance = this.tika == null ? new Tika() : this.tika;
        return new TikaProvider(tikaInstance, failIfOctetStream);
    }
}

19 View Complete Implementation : TikaVersion.java
Copyright GNU General Public License v2.0
Author : SOBotics
@Path("/version")
public clreplaced TikaVersion {

    private Tika tika;

    public TikaVersion() {
        this.tika = new Tika(TikaResource.getConfig());
    }

    @GET
    @Produces("text/plain")
    public String getVersion() {
        return tika.toString();
    }
}

19 View Complete Implementation : CodeApp.java
Copyright Apache License 2.0
Author : naver
@AnonymousCheck
public clreplaced CodeApp extends Controller {

    public static String hostName;

    @IsAllowed(Operation.READ)
    public static Result codeBrowser(String userName, String projectName) throws IOException, UnsupportedOperationException, ServletException {
        Project project = Project.findByOwnerAndProjectName(userName, projectName);
        if (!RepositoryService.VCS_GIT.equals(project.vcs) && !RepositoryService.VCS_SUBVERSION.equals(project.vcs)) {
            return status(Http.Status.NOT_IMPLEMENTED, project.vcs + " is not supported!");
        }
        PlayRepository repository = RepositoryService.getRepository(project);
        if (repository.isEmpty()) {
            switch(project.vcs) {
                case RepositoryService.VCS_GIT:
                    return ok(nohead.render(project));
                case RepositoryService.VCS_SUBVERSION:
                    return ok(nohead_svn.render(project));
            }
        }
        String defaultBranch = project.defaultBranch();
        if (defaultBranch == null) {
            defaultBranch = "HEAD";
        } else if (defaultBranch.split("/").length >= 3) {
            defaultBranch = defaultBranch.split("/", 3)[2];
        }
        defaultBranch = URLEncoder.encode(defaultBranch, "UTF-8");
        return redirect(routes.CodeApp.codeBrowserWithBranch(userName, projectName, defaultBranch, ""));
    }

    @With(DefaultProjectCheckAction.clreplaced)
    public static Result codeBrowserWithBranch(String userName, String projectName, String branch, String path) throws UnsupportedOperationException, IOException, SVNException, GitAPIException, ServletException {
        Project project = Project.findByOwnerAndProjectName(userName, projectName);
        if (!RepositoryService.VCS_GIT.equals(project.vcs) && !RepositoryService.VCS_SUBVERSION.equals(project.vcs)) {
            return status(Http.Status.NOT_IMPLEMENTED, project.vcs + " is not supported!");
        }
        branch = HttpUtil.decodePathSegment(branch);
        path = HttpUtil.decodePathSegment(path);
        PlayRepository repository = RepositoryService.getRepository(project);
        List<String> branches = repository.getRefNames();
        List<ObjectNode> recursiveData = RepositoryService.getMetaDataFromAncestorDirectories(repository, branch, path);
        if (recursiveData == null) {
            return notFound(ErrorViews.NotFound.render());
        }
        return ok(view.render(project, branches, recursiveData, branch, path));
    }

    @With(DefaultProjectCheckAction.clreplaced)
    public static Result ajaxRequest(String userName, String projectName, String path) throws Exception {
        PlayRepository repository = RepositoryService.getRepository(userName, projectName);
        path = HttpUtil.decodePathSegment(path);
        ObjectNode fileInfo = repository.getMetaDataFromPath(path);
        if (fileInfo != null) {
            return ok(fileInfo);
        } else {
            return notFound();
        }
    }

    @With(DefaultProjectCheckAction.clreplaced)
    public static Result ajaxRequestWithBranch(String userName, String projectName, String branch, String path) throws UnsupportedOperationException, IOException, SVNException, GitAPIException, ServletException {
        CodeApp.hostName = request().host();
        PlayRepository repository = RepositoryService.getRepository(userName, projectName);
        branch = HttpUtil.decodePathSegment(branch);
        path = HttpUtil.decodePathSegment(path);
        ObjectNode fileInfo = repository.getMetaDataFromPath(branch, path);
        if (fileInfo != null) {
            return ok(fileInfo);
        } else {
            return notFound();
        }
    }

    @With(DefaultProjectCheckAction.clreplaced)
    public static Result showRawFile(String userName, String projectName, String revision, String path) throws Exception {
        path = HttpUtil.decodePathSegment(path);
        revision = HttpUtil.decodePathSegment(revision);
        byte[] fileAsRaw = RepositoryService.getFileAsRaw(userName, projectName, revision, path);
        if (fileAsRaw == null) {
            return redirect(routes.CodeApp.codeBrowserWithBranch(userName, projectName, revision, path));
        }
        MediaType mediaType = FileUtil.detectMediaType(fileAsRaw, FilenameUtils.getName(path));
        String mediaTypeString = "text/plain";
        String charset = FileUtil.getCharset(mediaType);
        if (charset != null) {
            mediaTypeString += "; charset=" + charset;
        }
        return ok(fileAsRaw).as(mediaTypeString);
    }

    @With(DefaultProjectCheckAction.clreplaced)
    public static Result showImageFile(String userName, String projectName, String revision, String path) throws Exception {
        revision = HttpUtil.decodePathSegment(revision);
        path = HttpUtil.decodePathSegment(path);
        final byte[] fileAsRaw = RepositoryService.getFileAsRaw(userName, projectName, revision, path);
        String mimeType = tika.detect(fileAsRaw);
        return ok(fileAsRaw).as(mimeType);
    }

    private static Tika tika = new Tika();

    public static String getURL(String ownerName, String projectName) {
        Project project = Project.findByOwnerAndProjectName(ownerName, projectName);
        return getURL(project);
    }

    public static String getURL(Project project) {
        if (project == null) {
            return null;
        } else if (RepositoryService.VCS_GIT.equals(project.vcs)) {
            return utils.Url.createWithContext(Arrays.asList(project.owner, project.name));
        } else if (RepositoryService.VCS_SUBVERSION.equals(project.vcs)) {
            return utils.Url.createWithContext(Arrays.asList("svn", project.owner, project.name));
        } else {
            return null;
        }
    }

    public static String getURLWithLoginId(Project project) {
        String url = getURL(project);
        if (url != null) {
            String loginId = session().get(UserApp.SESSION_LOGINID);
            if (loginId != null && !loginId.isEmpty()) {
                url = url.replace("://", "://" + loginId + "@");
            }
        }
        return url;
    }

    @IsAllowed(Operation.READ)
    public static Result openFile(String userName, String projectName, String revision, String path) throws Exception {
        revision = HttpUtil.decodePathSegment(revision);
        path = HttpUtil.decodePathSegment(path);
        byte[] raw = RepositoryService.getFileAsRaw(userName, projectName, revision, path);
        if (raw == null) {
            return notFound(ErrorViews.NotFound.render("error.notfound"));
        }
        return ok(raw).as(FileUtil.detectMediaType(raw, FilenameUtils.getName(path)).toString());
    }
}

19 View Complete Implementation : Helpers.java
Copyright Apache License 2.0
Author : adamkewley
/**
 * Detects the media type of a stream with a fresh Tika facade, passing the
 * file name along to the detector.
 *
 * @param s the stream handed to {@code Tika.detect}
 * @param fileName the file name handed to {@code Tika.detect}
 * @return whatever {@code Tika.detect(s, fileName)} returns
 * @throws IOException declared by {@code Tika.detect}
 */
public static String getMimeType(InputStream s, String fileName) throws IOException {
    return new Tika().detect(s, fileName);
}

19 View Complete Implementation : OOXMLContainerExtractionTest.java
Copyright GNU General Public License v2.0
Author : SOBotics
/**
 * Builds the container extractor under test from the parser and detector of
 * a default Tika facade.
 */
@Before
public void setUp() {
    final Tika facade = new Tika();
    extractor = new ParserContainerExtractor(facade.getParser(), facade.getDetector());
}

19 View Complete Implementation : MimeTypeDetector.java
Copyright GNU Affero General Public License v3.0
Author : dzhw
/**
 * Detect the mimetype of files using Apache Tika.
 *
 * @author René Reitmann
 */
public clreplaced MimeTypeDetector {

    private Tika tika;

    public MimeTypeDetector(Tika tika) {
        this.tika = tika;
    }

    /**
     * Detect the mime type of a {@link MultipartFile}.
     * @param multipartFile An uploaded File.
     * @return the mime type of the file
     * @throws IOException if the uploaded file cannot be read
     */
    @SuppressFBWarnings("RCN_REDUNDANT_NULLCHECK_OF_NONNULL_VALUE")
    public String detect(MultipartFile multipartFile) throws IOException {
        try (InputStream is = multipartFile.getInputStream()) {
            return tika.detect(is);
        }
    }
}

19 View Complete Implementation : GeneralFileHandlerServiceImpl.java
Copyright Apache License 2.0
Author : sastix
@Service
public clreplaced GeneralFileHandlerServiceImpl implements GeneralFileHandlerService {

    private final Tika tika = new Tika();

    @Override
    public Charset guessCharset(InputStream is) throws IOException {
        return Charset.forName(tika.detect(is));
    }

    @Override
    public String getMediaType(byte[] bytes) throws IOException {
        String mimeType = tika.detect(bytes);
        return mimeType;
    }

    @Override
    public String findParentFile(String xml) {
        String ret = null;
        Doreplacedent doc = Jsoup.parse(xml, "", Parser.xmlParser());
        for (Element e : doc.select("resources")) {
            ret = e.select("resource").get(0).attr("href");
        }
        return ret;
    }

    @Override
    public void replaceRelativePathsInWebFiles(File file, Map<String, String> paths) {
    }
}

18 View Complete Implementation : TikaImpl.java
Copyright Apache License 2.0
Author : elastic
/**
 * do NOT make public
 */
final clreplaced TikaImpl {

    /**
     * subset of parsers for types we support
     */
    private static final Parser[] PARSERS = new Parser[] { // doreplacedents
    new org.apache.tika.parser.html.HtmlParser(), new org.apache.tika.parser.rtf.RTFParser(), new org.apache.tika.parser.pdf.PDFParser(), new org.apache.tika.parser.txt.TXTParser(), new org.apache.tika.parser.microsoft.OfficeParser(), new org.apache.tika.parser.microsoft.OldExcelParser(), new org.apache.tika.parser.microsoft.ooxml.OOXMLParser(), new org.apache.tika.parser.odf.OpenDoreplacedentParser(), new org.apache.tika.parser.iwork.IWorkPackageParser(), new org.apache.tika.parser.xml.DcXMLParser() };

    /**
     * autodetector based on this subset
     */
    private static final AutoDetectParser PARSER_INSTANCE = new AutoDetectParser(PARSERS);

    /**
     * singleton tika instance
     */
    private static final Tika TIKA_INSTANCE = new Tika(PARSER_INSTANCE.getDetector(), PARSER_INSTANCE);

    /**
     * parses with tika, throwing any exception hit while parsing the doreplacedent
     */
    // only package private for testing!
    static String parse(final byte[] content, final Metadata metadata, final int limit) throws TikaException, IOException {
        // check that its not unprivileged code like a script
        SecurityManager sm = System.getSecurityManager();
        if (sm != null) {
            sm.checkPermission(new SpecialPermission());
        }
        try {
            return AccessController.doPrivileged(new PrivilegedExceptionAction<String>() {

                @Override
                public String run() throws TikaException, IOException {
                    return TIKA_INSTANCE.parseToString(StreamInput.wrap(content), metadata, limit);
                }
            });
        } catch (PrivilegedActionException e) {
            // checked exception from tika: unbox it
            Throwable cause = e.getCause();
            if (cause instanceof TikaException) {
                throw (TikaException) cause;
            } else if (cause instanceof IOException) {
                throw (IOException) cause;
            } else {
                throw new replacedertionError(cause);
            }
        }
    }
}

18 View Complete Implementation : CMSThemeFile.java
Copyright GNU Lesser General Public License v3.0
Author : FenixEdu
public clreplaced CMSThemeFile {

    private static final Tika tika = new Tika();

    private final String fullPath;

    private final byte[] content;

    private final String contentType;

    private final String fileName;

    private final DateTime lastModified;

    public CMSThemeFile(String filename, String fullPath, byte[] content) {
        this.fileName = filename;
        this.fullPath = fullPath;
        this.content = content;
        this.contentType = tika.detect(content, filename);
        this.lastModified = DateTime.now();
    }

    CMSThemeFile(JsonObject json) {
        this.fileName = json.get("fileName").getreplacedtring();
        this.fullPath = json.get("fullPath").getreplacedtring();
        this.contentType = json.get("contentType").getreplacedtring();
        this.content = Base64.getDecoder().decode(json.get("content").getreplacedtring());
        this.lastModified = new DateTime(json.get("lastModified").getAsLong());
    }

    public String getFullPath() {
        return fullPath;
    }

    public byte[] getContent() {
        return content;
    }

    public String getContentType() {
        return contentType;
    }

    public String getFileName() {
        return fileName;
    }

    public long getFileSize() {
        return content.length;
    }

    public DateTime getLastModified() {
        return lastModified;
    }

    public JsonElement toJson() {
        JsonObject json = new JsonObject();
        json.addProperty("fileName", fileName);
        json.addProperty("fullPath", fullPath);
        json.addProperty("contentType", contentType);
        json.addProperty("content", Base64.getEncoder().encodeToString(content));
        json.addProperty("lastModified", lastModified.getMillis());
        return json;
    }
}

18 View Complete Implementation : SimpleTikaBuilder.java
Copyright Apache License 2.0
Author : groupe-sii
/**
 * Stores the given Tika instance in this builder and returns the builder so
 * calls can be chained.
 *
 * @param tika
 *            the Tika instance to remember
 * @return this builder
 */
@Override
public TikaBuilder<P> instance(Tika tika) {
    this.tika = tika;
    return this;
}

18 View Complete Implementation : SimpleTikaBuilder.java
Copyright Apache License 2.0
Author : groupe-sii
/**
 * Creates the provider from the configured Tika instance, or from a default
 * {@code new Tika()} when none was set, together with the
 * {@code failIfOctetStream} flag.
 *
 * @return a new {@link TikaProvider}
 */
@Override
public MimeTypeProvider build() {
    Tika effectiveTika = this.tika;
    if (effectiveTika == null) {
        effectiveTika = new Tika();
    }
    return new TikaProvider(effectiveTika, failIfOctetStream);
}

18 View Complete Implementation : TikaBuilderDelegate.java
Copyright Apache License 2.0
Author : groupe-sii
/**
 * Forwards the Tika instance to the wrapped {@code delegate} and returns this
 * delegating builder (not the wrapped one), so chaining stays on this type.
 *
 * @param tika
 *            the Tika instance passed on to the delegate
 * @return this builder
 */
@Override
public TikaBuilderDelegate<P> instance(Tika tika) {
    delegate.instance(tika);
    return this;
}

18 View Complete Implementation : TikaProvider.java
Copyright Apache License 2.0
Author : groupe-sii
/**
 * Mime Type detection based on <a href="http://tika.apache.org/">Apache
 * Tika</a>. This library is very complete and up-to-date. It is also able to
 * parse doreplacedents but this is not useful in our case.
 *
 * @author Aurélien Baudet
 * @see <a href="http://tika.apache.org/">Apache Tika</a>
 */
public clreplaced TikaProvider implements MimeTypeProvider {

    private static final Logger LOG = LoggerFactory.getLogger(TikaProvider.clreplaced);

    /**
     * The Tika instance to use
     */
    private final Tika tika;

    /**
     * Whether to fail if the default mimetype is return (this may indicate that
     * detection hasn't work).
     */
    private final boolean failIfOctetStream;

    /**
     * Initialize the provider with default Tika instance and configuration. It
     * fails if application/octet-stream mimetype is returned
     */
    public TikaProvider() {
        this(new Tika(), true);
    }

    /**
     * Initialize the provider with the specified Tika instance.
     *
     * @param tika
     *            the Tika instance to use
     * @param failIfOctetStream
     *            Whether to fail if the default mimetype is return (this may
     *            indicate that detection hasn't work).
     */
    public TikaProvider(Tika tika, boolean failIfOctetStream) {
        super();
        this.tika = tika;
        this.failIfOctetStream = failIfOctetStream;
    }

    @Override
    public MimeType getMimeType(File file) throws MimeTypeDetectionException {
        try {
            LOG.debug("Detect mime type for file {}", file);
            String mimetype = tika.detect(file);
            LOG.debug("Detect mime type for file {}: {}", file, mimetype);
            checkMimeType(mimetype);
            return new MimeType(mimetype);
        } catch (MimeTypeParseException e) {
            throw new MimeTypeDetectionException("Invalid mimetype", e);
        } catch (IOException e) {
            throw new MimeTypeDetectionException("Failed to get the mimetype for the file " + file, e);
        }
    }

    @Override
    public MimeType getMimeType(String fileName) throws MimeTypeDetectionException {
        return getMimeType(new File(fileName));
    }

    @Override
    public MimeType detect(InputStream stream) throws MimeTypeDetectionException {
        try {
            LOG.debug("Detect mime type from stream");
            String mimetype = tika.detect(stream);
            LOG.debug("Detect mime type from stream: {}", mimetype);
            checkMimeType(mimetype);
            return new MimeType(mimetype);
        } catch (MimeTypeParseException e) {
            throw new MimeTypeDetectionException("Invalid mimetype", e);
        } catch (IOException e) {
            throw new MimeTypeDetectionException("Failed to get the mimetype because the stream is not readable", e);
        }
    }

    @Override
    public MimeType detect(String content) throws MimeTypeDetectionException {
        try {
            LOG.debug("Detect mime type from stream");
            String mimetype = tika.detect(content.getBytes());
            LOG.debug("Detect mime type from stream: {}", mimetype);
            checkMimeType(mimetype);
            return new MimeType(mimetype);
        } catch (MimeTypeParseException e) {
            throw new MimeTypeDetectionException("Invalid mimetype", e);
        }
    }

    @Override
    public String toString() {
        StringBuilder builder = new StringBuilder();
        builder.append("TikaProvider [tika=").append(tika.toString()).append("]");
        return builder.toString();
    }

    private void checkMimeType(String mimetype) throws MimeTypeDetectionException {
        if (failIfOctetStream && MediaType.OCTET_STREAM.toString().equals(mimetype)) {
            throw new MimeTypeDetectionException("Default mimetype found (application/octet-stream) but provider is configured to fail in this case");
        }
    }
}

18 View Complete Implementation : MyMimeTypeUtils.java
Copyright Apache License 2.0
Author : h819
/**
 * Uses Tika to determine the MIME type of the resource behind a URL.
 * Because Tika has to read the File/URL data stream, detection takes some
 * time. Do not rely on the extension-based method instead: it cannot
 * determine the type dynamically and is inaccurate.
 *
 * @param url the URL to inspect
 * @param timeout timeout passed to the availability check
 * @return the MIME type reported by {@code Tika.detect(url)}
 * @throws Exception when the URL is reported as unavailable, or when
 *             detection fails
 */
public static String detect(URL url, int timeout) throws Exception {
    if (MyUrlUtils.isURLAvailable(url, timeout)) {
        return new Tika().detect(url);
    }
    // The URL is not reachable.
    throw new Exception("exception ! " + url.getAuthority() + " not available");
}

18 View Complete Implementation : MyMimeTypeUtils.java
Copyright Apache License 2.0
Author : h819
/*
 * The Tika class offers further detection methods as well.
 *
 * Because Tika has to read the File/URL data stream, detection takes some
 * time.
 *
 * When the extension is known use the String method; otherwise use the File
 * or URL method.
 */
/*
 * ======================================================================
 */
/*
 * (Note kept from an earlier String-based variant.)
 * Determines the MIME type from the extension; if there is no extension, use
 * the File or URL variant.
 *
 * Detects the media type of a document with the given file name. The type
 * detection is based on known file name extensions. The given name can also
 * be a URL or a full file path. In such cases only the file name portion of
 * the string is used for type detection.
 */
/**
 * Uses Tika to determine the MIME type of a file.
 * Because Tika has to read the File/URL data stream, detection takes some
 * time. Do not rely on the extension-based method instead: it cannot
 * determine the type dynamically and is inaccurate.
 *
 * @param file the file to inspect
 * @return the MIME type reported by {@code Tika.detect(file)}
 * @throws Exception when the file does not exist, or when detection fails
 */
public static String detect(File file) throws Exception {
    if (file.exists()) {
        return new Tika().detect(file);
    }
    // The file does not exist.
    throw new Exception("exception ! " + file.getAbsoluteFile() + " not existes.");
}

18 View Complete Implementation : IOUtil.java
Copyright GNU Lesser General Public License v2.1
Author : lucee
/**
 * Returns the mime type Tika detects from raw bytes; the file extension is
 * not consulted.
 *
 * @param barr the bytes to inspect
 * @param defaultValue value returned when detection fails and the failure
 *            is not rethrown by {@code ExceptionUtil.rethrowIfNecessary}
 * @return the detected mime type, or {@code defaultValue}
 */
public static String getMimeType(byte[] barr, String defaultValue) {
    String detected = defaultValue;
    try {
        detected = new Tika().detect(barr);
    } catch (Throwable failure) {
        ExceptionUtil.rethrowIfNecessary(failure);
    }
    return detected;
}

18 View Complete Implementation : IOUtil.java
Copyright GNU Lesser General Public License v2.1
Author : lucee
/**
 * Returns the mime type Tika detects for the resource behind a URL.
 *
 * @param url the URL to inspect
 * @param defaultValue value returned when detection throws an exception
 * @return the detected mime type, or {@code defaultValue}
 */
public static String getMimeType(URL url, String defaultValue) {
    String detected = defaultValue;
    try {
        detected = new Tika().detect(url);
    } catch (Exception ignored) {
        // Best effort: any failure falls back to the default value.
    }
    return detected;
}

18 View Complete Implementation : FileDownloadView.java
Copyright Apache License 2.0
Author : miyabayt
/**
 * FileDownloadビュー
 */
public clreplaced FileDownloadView extends AbstractView {

    private int chunkSize = 256;

    private Resource resource;

    @Setter
    private boolean isAttachment = true;

    @Setter
    protected String filename;

    protected static final Tika TIKA = new Tika();

    /**
     * コンストラクタ
     */
    public FileDownloadView(Resource resource) {
        this(resource, 256);
    }

    /**
     * コンストラクタ
     */
    public FileDownloadView(Resource resource, int chunkSize) {
        this.resource = resource;
        this.chunkSize = chunkSize;
    }

    @Override
    protected final void renderMergedOutputModel(Map<String, Object> model, HttpServletRequest request, HttpServletResponse response) throws Exception {
        try (InputStream inputStream = resource.getInputStream();
            OutputStream outputStream = response.getOutputStream()) {
            val file = resource.getFile();
            val detectedContentType = TIKA.detect(file);
            val mediaType = MediaType.parseMediaType(detectedContentType);
            val inlineOrAttachment = (isAttachment) ? "attachment" : "inline";
            val contentDisposition = String.format("%s; filename=\"%s\"", inlineOrAttachment, filename);
            response.setHeader(CONTENT_TYPE, mediaType.toString());
            response.setHeader(CONTENT_DISPOSITION, contentDisposition);
            byte[] buffer = new byte[chunkSize];
            int length;
            while ((length = inputStream.read(buffer)) > 0) {
                outputStream.write(buffer, 0, length);
            }
            outputStream.flush();
        } catch (IOException e) {
            throw new IllegalArgumentException(e);
        }
    }
}

18 View Complete Implementation : ArtifactPost.java
Copyright Apache License 2.0
Author : Open-MBEE
public clreplaced ArtifactPost extends AbstractJavaWebScript {

    static Logger logger = Logger.getLogger(ArtifactPost.clreplaced);

    protected EmsScriptNode artifact = null;

    protected String filename = null;

    protected String finalContentType = null;

    protected String artifactId = null;

    protected String extension = null;

    protected String content = null;

    protected String siteName = null;

    protected EmsScriptNode workspace = null;

    protected Path filePath = null;

    protected String mimeType = null;

    protected String encoding = null;

    protected Tika tika = new Tika();

    protected TikaConfig tikaConfig = TikaConfig.getDefaultConfig();

    private final String NEWELEMENTS = "newElements";

    public ArtifactPost() {
        super();
    }

    public ArtifactPost(Repository repositoryHelper, ServiceRegistry registry) {
        super(repositoryHelper, registry);
    }

    /**
     * Entry point
     */
    @Override
    protected Map<String, Object> executeImpl(WebScriptRequest req, Status status, Cache cache) {
        ArtifactPost instance = new ArtifactPost(repository, services);
        instance.setServices(getServices());
        return instance.executeImplImpl(req, status, cache);
    }

    @Override
    protected Map<String, Object> executeImplImpl(final WebScriptRequest req, final Status status, Cache cache) {
        String user = AuthenticationUtil.getFullyAuthenticatedUser();
        printHeader(user, logger, req, true);
        Timer timer = new Timer();
        Map<String, Object> result = new HashMap<>();
        JsonObject postJson = new JsonObject();
        FormData formData = (FormData) req.parseContent();
        FormData.FormField[] fields = formData.getFields();
        try {
            for (FormData.FormField field : fields) {
                if (logger.isDebugEnabled()) {
                    logger.debug("field.getName(): " + field.getName());
                }
                if (field.getName().equals("file") && field.getIsFile()) {
                    // String extension = FilenameUtils.getExtension();
                    // String filenameString = field.getFilename().substring(0, field.getFilename().lastIndexOf('.') - 1);
                    filename = field.getFilename().replaceAll("[^a-zA-Z0-9.-]", "_");
                    Content tempContent = field.getContent();
                    mimeType = tempContent.getMimetype();
                    encoding = tempContent.getEncoding();
                    filePath = EmsNodeUtil.saveToFilesystem(filename, field.getInputStream());
                    content = new String(Files.readAllBytes(filePath));
                    finalContentType = tika.detect(filePath);
                    if (logger.isDebugEnabled()) {
                        logger.debug("filename: " + filename);
                        logger.debug("mimetype: " + mimeType);
                        logger.debug("finalMimetype: " + finalContentType);
                        logger.debug("encoding: " + encoding);
                        logger.debug("content: " + content);
                    }
                } else {
                    String name = field.getName();
                    String value = field.getValue();
                    postJson.addProperty(name, value);
                    if (logger.isDebugEnabled()) {
                        logger.debug("property name: " + name);
                    }
                }
            }
            postJson.addProperty(Sjm.TYPE, "Artifact");
        } catch (Exception e) {
            logger.error(String.format("%s", LogUtil.getStackTrace(e)));
        } catch (Throwable t) {
            logger.error(String.format("%s", LogUtil.getStackTrace(t)));
        }
        // Would ideally be a transaction, :TODO the image has to be successfully posted before the json is post to the db
        // maybe processArtifactDelta needs to be called from handleArtifactPost
        if (handleArtifactPost(req, status, user, postJson)) {
            result = processArtifactDelta(req, user, postJson, status);
        }
        printFooter(user, logger, timer);
        return result;
    }

    protected Map<String, Object> processArtifactDelta(final WebScriptRequest req, String user, JsonObject postJson, final Status status) {
        String refId = getRefId(req);
        String projectId = getProjectId(req);
        Map<String, Object> model = new HashMap<>();
        JsonObject newElementsObject = new JsonObject();
        JsonObject results;
        EmsNodeUtil emsNodeUtil = new EmsNodeUtil(projectId, refId);
        try {
            JsonArray delta = new JsonArray();
            postJson.addProperty(Sjm.CHECKSUM, EmsNodeUtil.md5Hash(filePath.toFile()));
            delta.add(postJson);
            this.populateSourceApplicationFromJson(postJson);
            Set<String> oldElasticIds = new HashSet<>();
            results = emsNodeUtil.processPostJson(delta, user, oldElasticIds, false, this.requestSourceApplication, JsonUtil.getOptString(postJson, "comment"), Sjm.ARTIFACT);
            String commitId = results.get("commit").getAsJsonObject().get(Sjm.ELASTICID).getreplacedtring();
            if (CommitUtil.sendDeltas(results, projectId, refId, requestSourceApplication, services, false, true)) {
                if (!oldElasticIds.isEmpty()) {
                    emsNodeUtil.updateElasticRemoveRefs(oldElasticIds, "artifact");
                }
                Map<String, String> commitObject = emsNodeUtil.getGuidAndTimestampFromElasticId(commitId);
                newElementsObject.add(Sjm.ARTIFACTS, filterByPermission(results.get(NEWELEMENTS).getAsJsonArray(), req));
                newElementsObject.addProperty(Sjm.COMMITID, commitId);
                newElementsObject.addProperty(Sjm.TIMESTAMP, commitObject.get(Sjm.TIMESTAMP));
                newElementsObject.addProperty(Sjm.CREATOR, user);
                if (prettyPrint) {
                    Gson gson = new GsonBuilder().setPrettyPrinting().create();
                    model.put(Sjm.RES, gson.toJson(newElementsObject));
                } else {
                    model.put(Sjm.RES, newElementsObject);
                }
                status.setCode(responseStatus.getCode());
            } else {
                log(Level.ERROR, HttpServletResponse.SC_BAD_REQUEST, "Commit failed, please check server logs for failed items");
                model.put(Sjm.RES, createResponseJson());
            }
            return model;
        } catch (Exception e) {
            logger.error(String.format("%s", LogUtil.getStackTrace(e)));
        }
        return model;
    }

    boolean handleArtifactPost(final WebScriptRequest req, final Status status, String user, JsonObject postJson) {
        JsonObject resultJson = null;
        Map<String, Object> model = new HashMap<>();
        // Replace with true content type
        if (finalContentType != null) {
            postJson.addProperty(Sjm.CONTENTTYPE, finalContentType);
        }
        String projectId = getProjectId(req);
        String refId = getRefId(req);
        EmsNodeUtil emsNodeUtil = new EmsNodeUtil(projectId, refId);
        JsonObject project = emsNodeUtil.getProject(projectId);
        siteName = JsonUtil.getOptString(project, "orgId");
        if (validateRequest(req, status) && !siteName.isEmpty()) {
            try {
                extension = tikaConfig.getMimeRepository().forName(JsonUtil.getOptString(postJson, Sjm.CONTENTTYPE)).getExtension();
            } catch (MimeTypeException mte) {
                logger.debug(mte);
            }
            if (extension == null) {
                extension = FilenameUtils.getExtension(filename);
            }
            artifactId = postJson.get(Sjm.SYSMLID).getreplacedtring();
            // Create return json:
            resultJson = new JsonObject();
            resultJson.addProperty("filename", filename);
            // TODO: want full path here w/ path to site also, but Doris does not use it,
            // so leaving it as is.
            // resultJson.put("path", path);
            // resultJson.put("site", siteName);
            // Update or create the artifact if possible:
            if (!Utils.isNullOrEmpty(artifactId) && !Utils.isNullOrEmpty(content)) {
                String alfrescoId = artifactId + System.currentTimeMillis() + "." + extension;
                // :TODO check against checksum first, md5hash(content), if matching return the previous version
                if (filePath != null) {
                    artifact = EmsScriptNode.updateOrCreateArtifact(alfrescoId, filePath, JsonUtil.getOptString(postJson, Sjm.CONTENTTYPE), siteName, projectId, refId);
                }
                if (artifact == null) {
                    log(Level.ERROR, HttpServletResponse.SC_BAD_REQUEST, "Was not able to create the artifact!\n");
                    model.put(Sjm.RES, createResponseJson());
                } else {
                    String url = artifact.getUrl();
                    if (url != null) {
                        postJson.addProperty(Sjm.ARTIFACTLOCATION, url.replace("/d/d/", "/service/api/node/content/"));
                    }
                }
            } else {
                log(Level.INFO, HttpServletResponse.SC_BAD_REQUEST, "artifactId not supplied or content is empty!");
                model.put(Sjm.RES, createResponseJson());
            }
        } else {
            log(Level.INFO, HttpServletResponse.SC_BAD_REQUEST, "Invalid request, no sitename specified or no content provided!");
            model.put(Sjm.RES, createResponseJson());
        }
        status.setCode(responseStatus.getCode());
        if (!model.containsKey(Sjm.RES)) {
            model.put(Sjm.RES, resultJson != null ? resultJson : createResponseJson());
        }
        return true;
    }

    @Override
    protected boolean validateRequest(WebScriptRequest req, Status status) {
        String elementId = req.getServiceMatch().getTemplateVars().get("elementid");
        if (elementId != null && !checkRequestVariable(elementId, "elementid")) {
            return false;
        }
        return checkRequestContent(req);
    }
}

18 View Complete Implementation : TikaDemo.java
Copyright MIT License
Author : PacktPublishing
/**
 * Detects the media type of the demo resource and prints it, followed by the plain text that Tika
 * extracts from the same file. Failures are logged at {@code SEVERE} level.
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args) {
    Tika tika = new Tika();
    try {
        File file = new File(getResourcePath());
        String filetype = tika.detect(file);
        System.out.println(filetype);
        System.out.println(tika.parseToString(file));
    } catch (IOException | TikaException ex) {
        // Both failure modes are reported identically, so a single multi-catch clause is enough.
        Logger.getLogger(TikaDemo.class.getName()).log(Level.SEVERE, null, ex);
    }
}

18 View Complete Implementation : TempResourceAction.java
Copyright GNU Affero General Public License v3.0
Author : phenotips
/**
 * Action responsible for downloading temporary resources created by various modules. The temporary resource is put in
 * the temporary directory in a directory named "temp" and in subdirectories "(module)/(wiki)/(space)/(page)/(file)"
 * where:
 * <ul>
 * <li>(module): it's the 3rd path segment in the request URL (format: {code .../temp/1/2/3/4})</li>
 * <li>(wiki): the name of the current wiki (extracted from the URL too)</li>
 * <li>(space): it's the 1st path segment in the request URL (format: {code .../temp/1/2/3/4})</li>
 * <li>(page): it's the 2nd path segment in the request URL (format: {code .../temp/1/2/3/4})</li>
 * <li>(file): it's the 4th path segment in the request URL (format: {code .../temp/1/2/3/4})</li>
 * </ul>
 * <p>
 * For example if the URL is {@code http://localhost:8080/xwiki/bin/temp/Main/WebHome/test/test.png} then the resource
 * will be fetched from {@code TMPDIR/temp/test/xwiki/Main/WebHome/test.png}.
 *
 * @version $Id: 60906b3ffea2f37f15f7d718b773aab336343a70 $
 * @since 2.4M1
 */
public clreplaced TempResourceAction extends XWikiAction {

    /**
     * URI pattern for this action.
     */
    public static final Pattern URI_PATTERN = Pattern.compile(".*?/temp/([^/]*+)/([^/]*+)/([^/]*+)/(.*+)");

    /**
     * The path separator.
     */
    private static final String PATH_SEPARATOR = "/";

    /**
     * The URL encoding.
     */
    private static final String URL_ENCODING = "UTF-8";

    /**
     * Logging support.
     */
    private static final Logger LOGGER = LoggerFactory.getLogger(TempResourceAction.clreplaced);

    /**
     * Used for detecting mime-types of files.
     */
    private Tika tika = new Tika();

    /**
     * Used to find the temporary dir.
     */
    private Environment environment = Utils.getComponent(Environment.clreplaced);

    @Override
    public String render(XWikiContext context) throws XWikiException {
        XWikiRequest request = context.getRequest();
        XWikiResponse response = context.getResponse();
        String uri = request.getRequestURI();
        // Locate the temporary file.
        File tempFile = getTemporaryFile(uri, context);
        if (null == tempFile) {
            throw new XWikiException(XWikiException.MODULE_XWIKI_APP, XWikiException.ERROR_XWIKI_APP_URL_EXCEPTION, "Invalid temporary resource URL");
        }
        // Write temporary file into response.
        response.setDateHeader("Last-Modified", tempFile.lastModified());
        String contentType = MimeTypes.OCTET_STREAM;
        try {
            contentType = this.tika.detect(tempFile);
        } catch (IOException ex) {
            LOGGER.warn(String.format("Unable to determine mime type for temporary resource [%s]", tempFile.getAbsolutePath()), ex);
        }
        response.setContentType(contentType);
        if ("1".equals(request.getParameter("force-download"))) {
            String fileName = StringUtils.defaultIfBlank(request.getParameter("force-filename"), tempFile.getName());
            fileName = Util.encodeURI(fileName, context).replaceAll("\\+", "%20");
            response.addHeader("Content-disposition", "attachment; filename*=utf-8''" + fileName);
        }
        try {
            response.setContentLength((int) tempFile.length());
            IOUtils.copy(FileUtils.openInputStream(tempFile), response.getOutputStream());
        } catch (IOException e) {
            throw new XWikiException(XWikiException.MODULE_XWIKI_APP, XWikiException.ERROR_XWIKI_APP_SEND_RESPONSE_EXCEPTION, "Exception while sending response", e);
        }
        return null;
    }

    /**
     * Returns the temporary file corresponding to the specified URI.
     *
     * @param uri request URI.
     * @param context xwiki context.
     * @return temporary file corresponding to the specified URI or null if no such file can be located.
     */
    protected File getTemporaryFile(String uri, XWikiContext context) {
        Matcher matcher = URI_PATTERN.matcher(uri);
        File result = null;
        if (matcher.find()) {
            List<String> pathSegments = new ArrayList<String>();
            // Add all the path segments.
            pathSegments.add("temp");
            // temp/module
            pathSegments.add(withMinimalURLEncoding(matcher.group(3)));
            // temp/module/wiki
            pathSegments.add(encodeURLPathSegment(context.getWikiId()));
            // temp/module/wiki/space
            pathSegments.add(withMinimalURLEncoding(matcher.group(1)));
            // temp/module/wiki/space/page
            pathSegments.add(withMinimalURLEncoding(matcher.group(2)));
            // Save the path prefix before adding the file path to be able to check if the file path tries to get out of
            // the parent folder (e.g. using '/../').
            String prefix = StringUtils.join(pathSegments, PATH_SEPARATOR);
            // temp/module/wiki/space/page/path/to/file.tmp
            for (String filePathSegment : matcher.group(4).split(PATH_SEPARATOR)) {
                pathSegments.add(withMinimalURLEncoding(filePathSegment));
            }
            String path = URI.create(StringUtils.join(pathSegments, PATH_SEPARATOR)).normalize().toString();
            if (path.startsWith(prefix)) {
                result = new File(this.environment.getTemporaryDirectory(), path);
                result = result.exists() ? result : null;
            }
        }
        return result;
    }

    /**
     * Keeps only minimal URL encoding. Currently, XWiki's URL factory over encodes the URLs in order to protect them
     * from XWiki 1.0 syntax parser.
     * <p>
     * This method also ensures that the path to the temporary file is fully encoded (has the canonical form) even if
     * the URL used to access the file is partially decoded (which can happen for instance when XWiki is behind Apache's
     * {@code mode_proxy} with {@code nocanon} option disabled).
     *
     * @param encodedPathSegment an encoded URL path segment
     * @return the given string with minimal URL encoding
     */
    private String withMinimalURLEncoding(String encodedPathSegment) {
        return encodeURLPathSegment(decodeURLPathSegment(encodedPathSegment));
    }

    private String encodeURLPathSegment(String segment) {
        try {
            return URLEncoder.encode(segment, URL_ENCODING);
        } catch (UnsupportedEncodingException e) {
            // This should never happen.
            return segment;
        }
    }

    private String decodeURLPathSegment(String encodedSegment) {
        try {
            return URLDecoder.decode(encodedSegment, URL_ENCODING);
        } catch (UnsupportedEncodingException e) {
            // This should never happen.
            return encodedSegment;
        }
    }
}

18 View Complete Implementation : TikaFileTypeDetector.java
Copyright Apache License 2.0
Author : sleuthkit
/**
 * @deprecated Use org.sleuthkit.autopsy.modules.filetypeid.FileTypeDetector
 * instead.
 */
@Deprecated
public clreplaced TikaFileTypeDetector {

    // calling detect() with this should be thread-safe
    private static final Tika tikaInst = new Tika();

    // how many bytes to preplaced in
    private final int BUFFER_SIZE = 64 * 1024;

    private final byte[] buffer = new byte[BUFFER_SIZE];

    /**
     * Detect the mime type of the preplaceded in file
     *
     * @param abstractFile
     *
     * @return mime type of detected format or null
     *
     * @deprecated Use FileTypeDetector.detect(AbstractFile file) instead.
     */
    @Deprecated
    public synchronized String detect(AbstractFile abstractFile) {
        try {
            byte[] buf;
            int len = abstractFile.read(buffer, 0, BUFFER_SIZE);
            if (len < BUFFER_SIZE) {
                buf = new byte[len];
                System.arraycopy(buffer, 0, buf, 0, len);
            } else {
                buf = buffer;
            }
            String mimetype = tikaInst.detect(buf, abstractFile.getName());
            // Remove tika's name out of the general types like msoffice and ooxml
            // NON-NLS
            return mimetype.replace("tika-", "");
        } catch (Exception ex) {
        // do nothing
        }
        return null;
    }

    /**
     * Validate if a given mime type is in the registry. For Tika, we remove the
     * string "tika" from all MIME names, e.g. use "application/x-msoffice" NOT
     * "application/x-tika-msoffice"
     *
     * @deprecated Use FileTypeDetector.isDetectable(String mimeType) instead.
     * @param mimeType Full string of mime type, e.g. "text/html"
     *
     * @return true if detectable
     */
    @Deprecated
    public boolean isMimeTypeDetectable(String mimeType) {
        boolean ret = false;
        SortedSet<MediaType> m = MimeTypes.getDefaultMimeTypes().getMediaTypeRegistry().getTypes();
        String[] split = mimeType.split("/");
        if (split.length == 2) {
            String type = split[0];
            String subtype = split[1];
            MediaType mediaType = new MediaType(type, subtype);
            ret = m.contains(mediaType);
        }
        return ret;
    }
}

18 View Complete Implementation : AdvancedTypeDetector.java
Copyright GNU General Public License v2.0
Author : SOBotics
/**
 * Detects a media type from a name using a composite of two detectors: a custom detector that honours the
 * {@code my-custom-type-override} metadata entry, followed by the detector built from Tika's bundled
 * {@code tika-mimetypes.xml}.
 *
 * @param name the name handed to {@code Tika.detect(String)}
 * @return the detected media type as a string
 * @throws Exception if the mime types configuration cannot be loaded
 */
public static String detectWithCustomDetector(String name) throws Exception {
    Detector registryDetector = MimeTypesFactory.create("/org/apache/tika/mime/tika-mimetypes.xml");
    Detector overrideDetector = new Detector() {

        private static final long serialVersionUID = -5420638839201540749L;

        public MediaType detect(InputStream input, Metadata metadata) {
            // Without an explicit override, report the generic type.
            String override = metadata.get("my-custom-type-override");
            return override == null ? MediaType.OCTET_STREAM : MediaType.parse(override);
        }
    };
    return new Tika(new CompositeDetector(overrideDetector, registryDetector)).detect(name);
}

18 View Complete Implementation : LuceneIndexer.java
Copyright GNU General Public License v2.0
Author : SOBotics
public clreplaced LuceneIndexer {

    private final Tika tika;

    private final IndexWriter writer;

    public LuceneIndexer(Tika tika, IndexWriter writer) {
        this.tika = tika;
        this.writer = writer;
    }

    public void indexDoreplacedent(File file) throws Exception {
        Doreplacedent doreplacedent = new Doreplacedent();
        doreplacedent.add(new Field("filename", file.getName(), Store.YES, Index.replacedYZED));
        doreplacedent.add(new Field("fulltext", tika.parseToString(file), Store.NO, Index.replacedYZED));
        writer.addDoreplacedent(doreplacedent);
    }
}

18 View Complete Implementation : LuceneIndexerExtended.java
Copyright GNU General Public License v2.0
Author : SOBotics
@SuppressWarnings("deprecation")
public clreplaced LuceneIndexerExtended {

    private final Tika tika;

    private final IndexWriter writer;

    public LuceneIndexerExtended(IndexWriter writer, Tika tika) {
        this.writer = writer;
        this.tika = tika;
    }

    public static void main(String[] args) throws Exception {
        try (IndexWriter writer = new IndexWriter(new SimpleFSDirectory(new File(args[0])), new Standardreplacedyzer(Version.LUCENE_30), MaxFieldLength.UNLIMITED)) {
            LuceneIndexer indexer = new LuceneIndexer(new Tika(), writer);
            for (int i = 1; i < args.length; i++) {
                indexer.indexDoreplacedent(new File(args[i]));
            }
        }
    }

    public void indexDoreplacedent(File file) throws Exception {
        try (Reader fulltext = tika.parse(file)) {
            Doreplacedent doreplacedent = new Doreplacedent();
            doreplacedent.add(new Field("filename", file.getName(), Store.YES, Index.replacedYZED));
            doreplacedent.add(new Field("fulltext", fulltext));
            writer.addDoreplacedent(doreplacedent);
        }
    }
}

18 View Complete Implementation : MetadataAwareLuceneIndexer.java
Copyright GNU General Public License v2.0
Author : SOBotics
/**
 * Builds on the LuceneIndexer from Chapter 5 and adds indexing of Metadata.
 */
@SuppressWarnings("deprecation")
public clreplaced MetadataAwareLuceneIndexer {

    private Tika tika;

    private IndexWriter writer;

    public MetadataAwareLuceneIndexer(IndexWriter writer, Tika tika) {
        this.writer = writer;
        this.tika = tika;
    }

    public void indexContentSpecificMet(File file) throws Exception {
        Metadata met = new Metadata();
        try (InputStream is = new FileInputStream(file)) {
            tika.parse(is, met);
            Doreplacedent doreplacedent = new Doreplacedent();
            for (String key : met.names()) {
                String[] values = met.getValues(key);
                for (String val : values) {
                    doreplacedent.add(new Field(key, val, Store.YES, Index.replacedYZED));
                }
                writer.addDoreplacedent(doreplacedent);
            }
        }
    }

    public void indexWithDublinCore(File file) throws Exception {
        Metadata met = new Metadata();
        met.add(Metadata.CREATOR, "Manning");
        met.add(Metadata.CREATOR, "Tika in Action");
        met.set(Metadata.DATE, new Date());
        met.set(Metadata.FORMAT, tika.detect(file));
        met.set(DublinCore.SOURCE, file.toURI().toURL().toString());
        met.add(Metadata.SUBJECT, "File");
        met.add(Metadata.SUBJECT, "Indexing");
        met.add(Metadata.SUBJECT, "Metadata");
        met.set(Property.externalClosedChoise(Metadata.RIGHTS, "public", "private"), "public");
        try (InputStream is = new FileInputStream(file)) {
            tika.parse(is, met);
            Doreplacedent doreplacedent = new Doreplacedent();
            for (String key : met.names()) {
                String[] values = met.getValues(key);
                for (String val : values) {
                    doreplacedent.add(new Field(key, val, Store.YES, Index.replacedYZED));
                }
                writer.addDoreplacedent(doreplacedent);
            }
        }
    }
}

18 View Complete Implementation : StaticFileRenderer.java
Copyright Apache License 2.0
Author : 88250
/**
 * Static file renderer.
 *
 * @author <a href="http://88250.b3log.org">Liang Ding</a>
 * @version 2.0.1.0, Dec 3, 2019
 */
public clreplaced StaticFileRenderer extends AbstractResponseRenderer {

    /**
     * Logger.
     */
    private static final Logger LOGGER = LogManager.getLogger(StaticFileRenderer.clreplaced);

    private static final Tika TIKA = new Tika();

    @Override
    public void render(final RequestContext context) {
        final Response response = context.getResponse();
        try {
            String uri = context.requestURI();
            uri = URLs.decode(uri);
            uri = StringUtils.substringAfter(uri, Latkes.getStaticPath());
            byte[] bytes;
            if (!Latkes.isInJar()) {
                String path = StaticFileRenderer.clreplaced.getResource("/").getPath();
                if (StringUtils.contains(path, "/target/clreplacedes/") || StringUtils.contains(path, "/target/test-clreplacedes/")) {
                    // 开发时使用源码目录
                    path = StringUtils.replace(path, "/target/clreplacedes/", "/src/main/resources/");
                    path = StringUtils.replace(path, "/target/test-clreplacedes/", "/src/main/resources/");
                }
                path += uri;
                bytes = FileUtils.readFileToByteArray(new File(path));
            } else {
                try (final InputStream inputStream = StaticFileRenderer.clreplaced.getResourcereplacedtream(uri)) {
                    bytes = IOUtils.toByteArray(inputStream);
                }
            }
            final String contentType = TIKA.detect(uri);
            response.setContentType(contentType);
            response.sendBytes(bytes);
        } catch (final Exception e) {
            LOGGER.log(Level.ERROR, "Renders static file failed", e);
            response.sendError0(500);
        }
    }
}

18 View Complete Implementation : MimeTypeUtils.java
Copyright Apache License 2.0
Author : apache
/**
 * @author mattmann
 * @author bfoster
 *
 * <p>
 * This is a facade clreplaced to insulate CAS Metadata from its underlying Mime Type
 * substrate library, <a href="http://tika.apache.org/">Apache Tika</a>.
 * Any mime handling code should be placed in this utility clreplaced, and hidden
 * from the CAS Metadata clreplacedes that rely on it.
 * </p>
 */
public final clreplaced MimeTypeUtils {

    private static final String SEPARATOR = ";";

    public static final int HEADER_BYTE_SIZE = 1024;

    /* our Tika mime type registry */
    private MimeTypes mimeTypes;

    private Tika tika;

    /* whether or not magic should be employed or not */
    private boolean mimeMagic;

    /* static resource path for the mimeTypesFile */
    public final static String MIME_FILE_RES_PATH = "tika-mimetypes.xml";

    /* our log stream */
    private static final Logger LOG = Logger.getLogger(MimeTypeUtils.clreplaced.getName());

    public MimeTypeUtils() {
        this(MimeTypeUtils.clreplaced.getResourcereplacedtream(MIME_FILE_RES_PATH), true);
    }

    public MimeTypeUtils(String filePath) throws FileNotFoundException {
        this(filePath, true);
    }

    public MimeTypeUtils(String filePath, boolean magic) throws FileNotFoundException {
        this(new FileInputStream(filePath), magic);
    }

    public MimeTypeUtils(InputStream mimeIs, boolean magic) {
        try {
            this.mimeTypes = MimeTypesFactory.create(mimeIs);
            this.mimeMagic = magic;
            this.tika = new Tika(new DefaultDetector(this.mimeTypes));
        } catch (Exception e) {
            LOG.log(Level.SEVERE, "Failed to load MimeType Registry : " + e.getMessage(), e);
        }
    }

    /**
     * Cleans a {@link MimeType} name by removing out the actual
     * {@link MimeType}, from a string of the form:
     *
     * <pre>
     *           <primary type>/<sub type> ; < optional params
     * </pre>
     *
     * @param origType
     *            The original mime type string to be cleaned.
     * @return The primary type, and subtype, concatenated, e.g., the actual
     *         mime type.
     */
    public static String cleanMimeType(String origType) {
        if (origType == null) {
            return null;
        }
        // take the origType and split it on ';'
        String[] tokenizedMimeType = origType.split(SEPARATOR);
        if (tokenizedMimeType.length > 1) {
            // there was a ';' in there, take the first value
            return tokenizedMimeType[0];
        } else {
            // there wasn't a ';', so just return the orig type
            return origType;
        }
    }

    /**
     * Same as {@link #autoResolveContentType(String, String, byte[])}, but
     * this method preplacedes <code>null</code> as the initial type.
     *
     * @param url
     *            The String URL to use to check glob patterns.
     * @param data
     *            The byte data to potentially use in magic detection.
     * @return The String {@link MimeType}.
     */
    public String autoResolveContentType(String url, byte[] data) {
        return autoResolveContentType(null, url, data);
    }

    /**
     * A facade interface to trying all the possible mime type resolution
     * strategies available within Tika. First, the mime type provided in
     * <code>typeName</code> is cleaned, with {@link #cleanMimeType(String)}.
     * Then the cleaned mime type is looked up in the underlying Tika
     * {@link MimeTypes} registry, by its cleaned name. If the {@link MimeType}
     * is found, then that mime type is used, otherwise {@link URL} resolution
     * is used to try and determine the mime type. If that means is
     * unsuccessful, and if <code>mime.type.magic</code> is enabled in
     * {@link NutchConfiguration}, then mime type magic resolution is used to
     * try and obtain a better-than-the-default approximation of the
     * {@link MimeType}.
     *
     * @param typeName
     *            The original mime type, returned from a {@link ProtocolOutput}.
     * @param url
     *            The given {@link URL}, that Nutch was trying to crawl.
     * @param data
     *            The byte data, returned from the crawl, if any.
     * @return The correctly, automatically guessed {@link MimeType} name.
     */
    public String autoResolveContentType(String typeName, String url, byte[] data) {
        MimeType type;
        String cleanedMimeType = null;
        try {
            cleanedMimeType = MimeTypeUtils.cleanMimeType(typeName) != null ? this.mimeTypes.forName(MimeTypeUtils.cleanMimeType(typeName)).getName() : null;
        } catch (MimeTypeException mte) {
        // Seems to be a malformed mime type name...
        }
        // first try to get the type from the cleaned type name
        try {
            type = cleanedMimeType != null ? this.mimeTypes.forName(cleanedMimeType) : null;
        } catch (MimeTypeException e) {
            type = null;
        }
        // if returned null, or if it's the default type then try url resolution
        if (type == null || (type.getName().equals(MimeTypes.OCTET_STREAM))) {
            // If no mime-type header, or cannot find a corresponding registered
            // mime-type, then guess a mime-type from the url pattern
            try {
                type = mimeTypes.forName(tika.detect(url)) != null ? mimeTypes.forName(tika.detect(url)) : type;
            } catch (Exception e) {
            // MimeTypeException or IOException from tika.detect. Ignore.
            }
        }
        // if magic is enabled use mime magic to guess if the mime type returned
        // from the magic guess is different than the one that's already set so
        // far
        // if it is, and it's not the default mime type, then go with the mime
        // type
        // returned by the magic
        if (this.mimeMagic) {
            MimeType magicType;
            try {
                magicType = mimeTypes.forName(tika.detect(data));
            } catch (Exception e) {
                magicType = null;
            }
            if (magicType != null && !magicType.getName().equals(MimeTypes.OCTET_STREAM) && type != null && !type.getName().equals(magicType.getName())) {
                // If magic enabled and the current mime type differs from that
                // of the
                // one returned from the magic, take the magic mimeType
                type = magicType;
            }
            // if type is STILL null after all the resolution strategies, go for
            // the
            // default type
            if (type == null) {
                try {
                    type = this.mimeTypes.forName(MimeTypes.OCTET_STREAM);
                } catch (Exception ignore) {
                }
            }
        }
        return type != null ? type.getName() : null;
    }

    /**
     * Facade interface to Tika's underlying
     * {@link tika.detect(String)} method.
     *
     * @param url
     *            A string representation of the doreplacedent {@link URL} to sense
     *            the {@link MimeType} for.
     * @return An appropriate {@link MimeType}, identified from the given
     *         Doreplacedent url in string form.
     */
    public String getMimeType(URL url) {
        try {
            return tika.detect(url);
        } catch (Exception e) {
            return null;
        }
    }

    /**
     * A facade interface to Tika's underlying {@link org.apache.tika.tika.detect(String)}
     * method.
     *
     * @param name
     *            The name of a valid {@link MimeType} in the Tika mime
     *            registry.
     * @return The object representation of the {@link MimeType}, if it exists,
     *         or null otherwise.
     */
    public String getMimeType(String name) {
        try {
            return tika.detect(name);
        } catch (Exception e) {
            LOG.log(Level.SEVERE, e.getMessage());
            return null;
        }
    }

    /**
     * Facade interface to Tika's underlying {@link org.apache.tika.Tika#detect(File)}
     * method.
     *
     * @param f
     *            The {@link File} to sense the {@link MimeType} for.
     * @return The {@link MimeType} of the given {@link File}, or null if it
     *         cannot be determined.
     */
    public String getMimeType(File f) {
        try {
            return tika.detect(f);
        } catch (Exception e) {
            System.err.println("\n\n\n");
            LOG.log(Level.SEVERE, e.getMessage());
            System.err.println("\n\n\n");
            return null;
        }
    }

    /**
     * Utility method to act as a facade to
     * {@link MimeTypes#getMimeType(byte[])}.
     *
     * @param data
     *            The byte data to get the {@link MimeType} for.
     * @return The String representation of the resolved {@link MimeType}, or
     *         null if a suitable {@link MimeType} is not found.
     */
    public String getMimeTypeByMagic(byte[] data) {
        try {
            return tika.detect(data);
        } catch (Exception e) {
            return null;
        }
    }

    public String getDescriptionForMimeType(String mimeType) {
        try {
            return this.mimeTypes.forName(mimeType).getDescription();
        } catch (Exception e) {
            LOG.log(Level.WARNING, "Failed to get description for mimetype " + mimeType + " : " + e.getMessage());
            return null;
        }
    }

    public String getSuperTypeForMimeType(String mimeType) {
        try {
            MediaType mediaType = this.mimeTypes.getMediaTypeRegistry().getSupertype(this.mimeTypes.forName(mimeType).getType());
            if (mediaType != null) {
                return mediaType.getType() + "/" + mediaType.getSubtype();
            } else {
                return null;
            }
        } catch (Exception e) {
            LOG.log(Level.WARNING, "Failed to get super-type for mimetype " + mimeType + " : " + e.getMessage());
            return null;
        }
    }

    /**
     * @return the mimeMagic
     */
    public boolean isMimeMagic() {
        return mimeMagic;
    }

    /**
     * @param mimeMagic the mimeMagic to set
     */
    public void setMimeMagic(boolean mimeMagic) {
        this.mimeMagic = mimeMagic;
    }

    public static byte[] readMagicHeader(InputStream stream) throws IOException {
        return readMagicHeader(stream, HEADER_BYTE_SIZE);
    }

    public static byte[] readMagicHeader(InputStream stream, int headerByteSize) throws IOException {
        if (stream == null) {
            throw new IllegalArgumentException("InputStream is missing");
        }
        byte[] bytes = new byte[headerByteSize];
        int totalRead = 0;
        int lastRead = stream.read(bytes);
        while (lastRead != -1) {
            totalRead += lastRead;
            if (totalRead == bytes.length) {
                return bytes;
            }
            lastRead = stream.read(bytes, totalRead, bytes.length - totalRead);
        }
        byte[] shorter = new byte[totalRead];
        System.arraycopy(bytes, 0, shorter, 0, totalRead);
        return shorter;
    }
}

18 View Complete Implementation : BinaryValidator.java
Copyright Apache License 2.0
Author : apache
public clreplaced BinaryValidator extends AbstractValidator {

    private static final long serialVersionUID = 1344152444666540361L;

    private static final ObjectMapper MAPPER = new ObjectMapper();

    private static final Tika TIKA = new Tika();

    static {
        TIKA.setMaxStringLength(-1);
    }

    @Override
    protected void doValidate(final PlainAttrValue attrValue) {
        // check Binary schemas MIME Type mismatches
        if (attrValue.getBinaryValue() != null) {
            byte[] binaryValue = attrValue.getBinaryValue();
            String mimeType = TIKA.detect(binaryValue);
            boolean valid = true;
            if (!mimeType.equals(attrValue.getAttr().getSchema().getMimeType())) {
                if (MediaType.TEXT_PLAIN.equals(mimeType) && MediaType.APPLICATION_JSON.equals(attrValue.getAttr().getSchema().getMimeType())) {
                    String decoded = new String(binaryValue).trim();
                    valid = (decoded.startsWith("{") && decoded.endsWith("}")) || (decoded.startsWith("[") && decoded.endsWith("]")) && isValidJSON(decoded);
                } else {
                    valid = false;
                }
            }
            if (!valid) {
                throw new InvalidPlainAttrValueException("Found MIME type: '" + mimeType + "', expecting: '" + attrValue.getAttr().getSchema().getMimeType() + '\'');
            }
        }
    }

    private static boolean isValidJSON(final String value) {
        try {
            MAPPER.readTree(value);
            return true;
        } catch (IOException e) {
            LOG.debug("Invalid JSON string: {}", value, e);
            return false;
        }
    }
}

18 View Complete Implementation : FileUtil.java
Copyright BSD 3-Clause "New" or "Revised" License
Author : Coding
/**
 * Created by phy on 2015/1/27.
 */
public abstract clreplaced FileUtil {

    private static MimetypesFileTypeMap mfm = new MimetypesFileTypeMap();

    private static Map<String, String> contentTypeMap = Maps.newHashMap();

    private static Tika tika = new Tika();

    static {
        contentTypeMap.put(".gitignore", "text/plain");
        contentTypeMap.put(".bowerrc", "text/plain");
        contentTypeMap.put(".editorconfig", "text/plain");
        contentTypeMap.put(".gitattributes", "text/plain");
        contentTypeMap.put("Rakefile", "text/x-extension-rake");
        contentTypeMap.put("Dockerfile", "text/x-extension-docker");
        contentTypeMap.put("Gemfile", "text/x-ruby-bundler-gemfile");
        contentTypeMap.put("Gemfile.lock", "text/x-ruby-bundler-gemfile-lock");
    }

    public static String getContentType(File file) {
        if (!file.isDirectory()) {
            String filename = file.getName();
            if (contentTypeMap.containsKey(filename)) {
                return contentTypeMap.get(filename);
            } else if (filename.indexOf('.') != -1) {
                String contentType = mfm.getContentType(filename);
                if (!contentType.equals("application/octet-stream")) {
                    return contentType;
                } else
                    return detectContentTypeByContent(file);
            } else {
                return detectContentTypeByContent(file);
            }
        } else {
            return null;
        }
    }

    private static String detectContentTypeByContent(File file) {
        if (file.length() == 0) {
            return "text/plain";
        } else {
            try {
                return tika.detect(file);
            } catch (Exception e) {
                return "application/octet-stream";
            }
        }
    }
}