Skip to content

Instantly share code, notes, and snippets.

@jster1357
Created May 30, 2024 19:54
Show Gist options
  • Save jster1357/3acdf74a08584fc2cd42d06c15515de9 to your computer and use it in GitHub Desktop.
Save jster1357/3acdf74a08584fc2cd42d06c15515de9 to your computer and use it in GitHub Desktop.
We can make this file beautiful and searchable if this error is corrected: No commas found in this CSV file in line 0.
5994000001|53532433|O|176627.30|1997-03-30|1-URGENT|Clerk#000632485|0|ly bold sentiments integrate doggedly? furious
5994000002|9601419|F|292586.12|1993-10-16|3-MEDIUM|Clerk#000171252|0|special pinto beans; furiously even ideas sle
5994000003|41037313|F|218489.85|1994-07-28|2-HIGH|Clerk#000218085|0|accounts may cajole. final dinos wake f
5994000004|14091148|O|52208.87|1998-02-22|2-HIGH|Clerk#000348944|0|ct furiously around the care
5994000005|129915457|F|201126.75|1993-07-18|1-URGENT|Clerk#000687363|0|ainst the slyly special courts. quickly ironi
import pyarrow.csv as csv
import pyarrow as pa
import pyarrow.fs
import gcsfs
import pyarrow.parquet as pq
# Object paths in GCS: where the delimited input is read from and where the
# Parquet output is written. Both values are placeholders to be filled in.
gcs_read_path = "my object path"
gcs_write_path = "my object path"
# Filesystem handle used by the fs.open() calls further down for both the
# read and the write.
fs = gcsfs.GCSFileSystem(project="my-project")
# Read options: column names are supplied explicitly rather than taken from
# the file (per the pyarrow docs, the first row is then read as data).
read_options = csv.ReadOptions(
    column_names=[
        "o_orderkey",
        "o_custkey",
        "o_orderstatus",
        "o_totalprice",
        "o_orderdate",
        "o_orderpriority",
        "o_clerk",
        "o_shippriority",
        "o_comment",
    ],
)
# Parse options: fields are separated by "|" instead of the default comma.
parse_options = csv.ParseOptions(delimiter="|")
# Conversion options: force explicit types for the two columns listed here;
# every other column is left to pyarrow's type inference.
convert_options = csv.ConvertOptions(
    column_types={
        "o_totalprice": pa.decimal128(10, 2),
        "o_orderdate": pa.date32(),
    },
)
# Read the delimited file from GCS into an in-memory Arrow table using the
# options configured above. The `with` statement closes the handle when the
# block exits, so no explicit close() call is needed.
with fs.open(gcs_read_path, "rb") as f:
    table = csv.read_csv(
        f,
        read_options=read_options,
        parse_options=parse_options,
        convert_options=convert_options,
    )

# Write the table to the destination object as Parquet; the handle is again
# closed by the context manager.
with fs.open(gcs_write_path, "wb") as f:
    pq.write_table(table, f)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment