If you are trying to read a Parquet file, you can do so by adding these two dependencies to your pom file.
<dependencies>
<dependency>
<groupId>org.apache.parquet</groupId>
<artifactId>parquet-hadoop</artifactId>
<version>1.9.0</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-common</artifactId>
<version>2.7.0</version>
</dependency>
</dependencies>
/**
 * Example: reads a Parquet file record-by-record and prints every primitive
 * field value to stdout, one "name value" pair per line.
 */
public class readParaquetFile {
    // Hard-coded input file for the demo; change to point at your own file.
    private static Path path = new Path("C:\\Users\\deepak.mathpal\\Downloads\\userdata1.parquet");

    /**
     * Prints each primitive field of one record as "fieldName value".
     * Nested (non-primitive) group fields are silently skipped.
     *
     * @param g a single materialized record from a row group
     */
    private static void printGroup(Group g) {
        int fieldCount = g.getType().getFieldCount();
        for (int field = 0; field < fieldCount; field++) {
            // A field may repeat (e.g. Parquet repeated fields); print every occurrence.
            int valueCount = g.getFieldRepetitionCount(field);
            Type fieldType = g.getType().getType(field);
            String fieldName = fieldType.getName();
            for (int index = 0; index < valueCount; index++) {
                if (fieldType.isPrimitive()) {
                    System.out.println(fieldName + " " + g.getValueToString(field, index));
                }
            }
        }
        // Blank line separates records in the output.
        System.out.println("");
    }

    /**
     * Opens the Parquet file, iterates over all row groups, and prints every record.
     * I/O failures are caught and reported; the reader is always closed.
     */
    public static void main(String[] args) throws IllegalArgumentException {
        Configuration conf = new Configuration();
        try {
            // Read the footer first: it carries the file schema and row-group metadata.
            ParquetMetadata readFooter = ParquetFileReader.readFooter(conf, path, ParquetMetadataConverter.NO_FILTER);
            MessageType schema = readFooter.getFileMetaData().getSchema();
            // try-with-resources: ParquetFileReader is Closeable, so this replaces
            // the manual try/finally and still closes on any exit path.
            try (ParquetFileReader r = new ParquetFileReader(conf, path, readFooter)) {
                PageReadStore pages;
                while (null != (pages = r.readNextRowGroup())) {
                    final long rows = pages.getRowCount();
                    System.out.println("Number of rows: " + rows);
                    final MessageColumnIO columnIO = new ColumnIOFactory().getColumnIO(schema);
                    // Parameterize the reader with <Group> so no unchecked cast is needed.
                    final RecordReader<Group> recordReader =
                            columnIO.getRecordReader(pages, new GroupRecordConverter(schema));
                    // Counter is long: getRowCount() returns long, and an int counter
                    // would overflow on row groups with more than Integer.MAX_VALUE rows.
                    for (long i = 0; i < rows; i++) {
                        printGroup(recordReader.read());
                    }
                }
            }
        } catch (IOException e) {
            System.out.println("Error reading parquet file.");
            e.printStackTrace();
        }
    }
}
No comments:
Post a Comment