Search This Blog

Tuesday, 29 June 2021

How to read a Parquet file in Java?

 

If you want to read a Parquet file in Java, you can do so by adding the following two dependencies to your pom.xml file and then using the class shown below.






<!-- Maven dependencies for the Parquet-reading example class in this post. -->
<dependencies>

 <!-- Apache Parquet's Hadoop integration module; presumably the source of the
      Parquet reader classes the example uses (confirm against its imports). -->
 <dependency>

 <groupId>org.apache.parquet</groupId>

 <artifactId>parquet-hadoop</artifactId>

 <version>1.9.0</version>

 </dependency>


 <!-- Hadoop common library; presumably supplies the Configuration and Path
      types the example uses (confirm against its imports). -->
 <dependency>

 <groupId>org.apache.hadoop</groupId>

 <artifactId>hadoop-common</artifactId>

 <version>2.7.0</version>

 </dependency>

</dependencies>


public class readParaquetFile {

    private static Path path = new Path("C:\\Users\\deepak.mathpal\\Downloads\\userdata1.parquet");
    
    private static void printGroup(Group g) {
        
       int fieldCount = g.getType().getFieldCount();
        
        for (int field = 0; field < fieldCount; field++) {

            int valueCount = g.getFieldRepetitionCount(field);

            Type fieldType = g.getType().getType(field);

            String fieldName = fieldType.getName();

            for (int index = 0; index < valueCount; index++) {

                if (fieldType.isPrimitive()) {

                    System.out.println(fieldName + " " + g.getValueToString(field, index));

                }

            }

        }

        System.out.println("");

    }

    public static void main(String[] args) throws IllegalArgumentException {

        Configuration conf = new Configuration();

        try {

            ParquetMetadata readFooter = ParquetFileReader.readFooter(conf, path, ParquetMetadataConverter.NO_FILTER);

            MessageType schema = readFooter.getFileMetaData().getSchema();

            ParquetFileReader r = new ParquetFileReader(conf, path, readFooter);

            PageReadStore pages = null;

            try {

                while (null != (pages = r.readNextRowGroup())) {

                    final long rows = pages.getRowCount();

                    System.out.println("Number of rows: " + rows);

                    final MessageColumnIO columnIO = new ColumnIOFactory().getColumnIO(schema);

                    final RecordReader recordReader = columnIO.getRecordReader(pages, new GroupRecordConverter(schema));

                    for (int i = 0; i < rows; i++) {

                        final Object g = recordReader.read();

                        printGroup((Group) g);

                    }

                }

            } finally {

                r.close();

           }

        } catch (IOException e) {

            System.out.println("Error reading parquet file.");

            e.printStackTrace();

        }

    }

}

No comments:

Post a Comment