Search This Blog

Breaking

Tuesday, 29 June 2021

How to read a Parquet file in Java?

 

If you are trying to read a Parquet file in Java, you can do so by adding these two dependencies to your pom.xml file and then using the class shown below.






<!-- Maven dependencies the post says are needed to read a Parquet file from Java. -->
<dependencies>

 <!-- Apache Parquet's Hadoop integration module, pinned to 1.9.0.
      NOTE(review): presumably the source of the Parquet reader classes used in the Java listing; confirm. -->
 <dependency>

 <groupId>org.apache.parquet</groupId>

 <artifactId>parquet-hadoop</artifactId>

 <version>1.9.0</version>

 </dependency>


 <!-- Apache Hadoop common module, pinned to 2.7.0.
      NOTE(review): presumably supplies the Configuration and Path types used in the Java listing; confirm. -->
 <dependency>

 <groupId>org.apache.hadoop</groupId>

 <artifactId>hadoop-common</artifactId>

 <version>2.7.0</version>

 </dependency>

</dependencies>


public class readParaquetFile {

    private static Path path = new Path("C:\\Users\\deepak.mathpal\\Downloads\\userdata1.parquet");
    
    private static void printGroup(Group g) {
        
       int fieldCount = g.getType().getFieldCount();
        
        for (int field = 0; field < fieldCount; field++) {

            int valueCount = g.getFieldRepetitionCount(field);

            Type fieldType = g.getType().getType(field);

            String fieldName = fieldType.getName();

            for (int index = 0; index < valueCount; index++) {

                if (fieldType.isPrimitive()) {

                    System.out.println(fieldName + " " + g.getValueToString(field, index));

                }

            }

        }

        System.out.println("");

    }

    public static void main(String[] args) throws IllegalArgumentException {

        Configuration conf = new Configuration();

        try {

            ParquetMetadata readFooter = ParquetFileReader.readFooter(conf, path, ParquetMetadataConverter.NO_FILTER);

            MessageType schema = readFooter.getFileMetaData().getSchema();

            ParquetFileReader r = new ParquetFileReader(conf, path, readFooter);

            PageReadStore pages = null;

            try {

                while (null != (pages = r.readNextRowGroup())) {

                    final long rows = pages.getRowCount();

                    System.out.println("Number of rows: " + rows);

                    final MessageColumnIO columnIO = new ColumnIOFactory().getColumnIO(schema);

                    final RecordReader recordReader = columnIO.getRecordReader(pages, new GroupRecordConverter(schema));

                    for (int i = 0; i < rows; i++) {

                        final Object g = recordReader.read();

                        printGroup((Group) g);

                    }

                }

            } finally {

                r.close();

           }

        } catch (IOException e) {

            System.out.println("Error reading parquet file.");

            e.printStackTrace();

        }

    }

}

No comments:

Post a Comment