Skip to content

Commit 9828e28

Browse files
committed
added csv_split
1 parent 8b65a7c commit 9828e28

File tree

1 file changed

+114
-150
lines changed

1 file changed

+114
-150
lines changed

12_csv_split.py

Lines changed: 114 additions & 150 deletions
Original file line numberDiff line numberDiff line change
@@ -1,167 +1,131 @@
1-
### WIP
2-
31
import sys
42
import os
5-
import getopt
63
import csv
4+
import argparse
75

86
"""
9-
Splits a CSV file into multiple pieces based on command line arguments.
7+
8+
Splits a CSV file into multiple pieces based on command line arguments.
109
1110
Arguments:
12-
`-h`: help file of usage of the script
13-
`-i`: input file name
14-
`-o`: output file, A %s-style template for the numbered output files.
15-
`-r`: row limit to split
16-
`-c`: A %s-style template for the numbered output files.
11+
12+
`-h`: help file of usage of the script
13+
`-i`: input file name
14+
`-o`: output file name
15+
`-r`: row limit to split
1716
1817
Default settings:
19-
`output_path` is the current directory
20-
`keep_headers` is on (headers will be kept)
21-
`delimeter` is ,
18+
19+
`output_path` is the current directory
20+
headers are displayed on each split file
21+
the default delimeter is a comma
2222
2323
Example usage:
24-
# split by every 10000 rows
25-
>> python 12_csv_split.py -i input.csv -o rownumber -r 10000
26-
# split by unique items in column 0
27-
>> python 12_csv_split.py -i input.csv -o userid -c 0
28-
# access help
29-
>> python 12_csv_split.py -h for help
30-
24+
25+
```
26+
# split csv by every 100 rows
27+
>> python csv_split.py -i input.csv -o output -r 100
28+
```
29+
3130
"""
3231

33-
def main(argv):
34-
35-
argument_dict = grab_command_line_arguments(argv)
36-
parse_file(argument_dict)
37-
38-
39-
def grab_command_line_arguments(argv):
40-
41-
# global variables
42-
inputfile = ''
43-
outputfile = ''
44-
rowlimit = ''
45-
columnindex = ''
46-
argument_dict = {}
47-
48-
# grab arguments
49-
opts, args = getopt.getopt(argv,"hi:o:r:c:",["ifile=","ofile=","rowlimit=","columnindex="])
50-
51-
# end if no arguments provided
52-
if not opts:
53-
print "No options provided. Try again. Use `-h` for help."
54-
sys.exit()
55-
56-
# grab arguments
57-
for opt, arg in opts:
58-
if opt == '-h':
59-
print 'csvsplit.py -i <inputfile> -r <row limit> -c <column index> -o <outputfile>'
60-
sys.exit()
61-
elif opt in ("-i", "--ifile"):
62-
inputfile = arg
63-
elif opt in ("-o", "--ofile"):
64-
outputfile = arg
65-
elif opt in ("-r", "--rowlimit"):
66-
rowlimit = arg
67-
elif opt in ("-c", "--columnindex"):
68-
columnindex = arg
69-
70-
# Output arguments
71-
print "\nArguments:"
72-
if inputfile:
73-
argument_dict["input_file"] = inputfile
74-
print "Input file is '{}'".format(inputfile)
75-
else:
76-
"Please enter an input file."
77-
if outputfile:
78-
argument_dict["output_file"] = outputfile
79-
print "Output file is '{}'".format(outputfile)
80-
else:
81-
print "Please enter an output file."
82-
if rowlimit:
83-
argument_dict["rowlimit"] = rowlimit
84-
print "Rowlimit is '{}'".format(rowlimit)
85-
if columnindex:
86-
argument_dict["columnindex"] = columnindex
87-
print "Columnindex is '{}'".format(columnindex)
88-
if rowlimit and columnindex:
89-
print "Please use either a rowlimit or columnlimit, not both."
90-
sys.exit()
91-
if not rowlimit or columnindex:
92-
print "Please enter either a rowlimit or columnlimit."
93-
sys.exit()
94-
95-
# to do - check to make sure file, rowlimit, and columnlimit exist
96-
print argument_dict
97-
return argument_dict
98-
99-
100-
def parse_file(argument_dict):
101-
102-
#split csv file by certain rownumber
103-
if argument_dict["rowlimit"]:
104-
rowlimit = int(argument_dict["rowlimit"])
105-
output_name_file = "{}.csv".format(argument_dict["output_file"])
106-
output_path='.'
107-
keep_headers=True
108-
delimiter=','
109-
filehandler = open(argument_dict["input_file"],'r')
110-
reader = csv.reader(filehandler, delimiter=delimiter)
111-
current_piece = 1
112-
current_out_path = os.path.join(
113-
output_path,
114-
output_name_file
32+
33+
def get_arguments():
34+
"""Grab user supplied arguments using the argparse library."""
35+
36+
# Use arparse to get command line arguments
37+
parser = argparse.ArgumentParser()
38+
parser.add_argument("-i", "--input_file", required=True,
39+
help="csv input file (with extension)", type=str)
40+
parser.add_argument("-o", "--output_file", required=True,
41+
help="csv output file (without extension)", type=str)
42+
parser.add_argument("-r", "--row_limit", required=True,
43+
help="row limit to split csv at", type=int)
44+
args = parser.parse_args()
45+
46+
# Check if the input_file exits
47+
is_valid_file(parser, args.input_file)
48+
49+
# Check if the input_file is valid
50+
is_valid_csv(parser, args.input_file, args.row_limit)
51+
52+
return args.input_file, args.output_file, args.row_limit
53+
54+
55+
def is_valid_file(parser, file_name):
56+
"""Ensure that the input_file exists."""
57+
if not os.path.exists(file_name):
58+
parser.error("The file '{}' does not exist!".format(file_name))
59+
sys.exit(1)
60+
61+
62+
def is_valid_csv(parser, file_name, row_limit):
63+
"""
64+
Ensure that the # of rows in the input_file
65+
is greater than the row_limit.
66+
"""
67+
row_count = 0
68+
for row in csv.reader(open(file_name)):
69+
row_count += 1
70+
# Note: You could also use a generator expression
71+
# and the sum() function to count the rows:
72+
# row_count = sum(1 for row in csv.reader(open(file_name)))
73+
if row_limit > row_count:
74+
parser.error(
75+
"The 'row_count' of '{}' is > the number of rows in '{}'!"
76+
.format(row_limit, file_name)
11577
)
116-
current_out_writer = csv.writer(open(current_out_path, 'w'), delimiter=delimiter)
117-
current_limit = rowlimit
118-
if keep_headers:
119-
headers = reader.next()
120-
current_out_writer.writerow(headers)
121-
for i, row in enumerate(reader):
122-
if i + 1 > current_limit:
123-
current_piece += 1
124-
current_limit = rowlimit * current_piece
125-
current_out_path = os.path.join(
126-
output_path,
127-
output_name_file
128-
)
129-
current_out_writer = csv.writer(open(current_out_path, 'w'), delimiter=delimiter)
130-
131-
# elif columnindex: #split csv file accrording to unique values of certain column,it's like filter only certain item in excel
132-
# itemlist = []
133-
# columnindex = int(columnindex)
134-
# output_name_template= outputfile+'_%s.csv'
135-
# output_path='.'
136-
# keep_headers=True
137-
# delimiter=','
138-
# filehandler = open(inputfile,'r')
139-
# reader = csv.reader(filehandler, delimiter=delimiter)
140-
# if keep_headers:
141-
# headers = reader.next()
142-
143-
# for i, row in enumerate(reader):
144-
145-
# current_out_path = os.path.join(
146-
# output_path,
147-
# output_name_template % row[columnindex] )
148-
# if row[columnindex] not in itemlist:
149-
# try:
150-
# current_out_writer = csv.writer(open(current_out_path, 'w'), delimiter=delimiter)
151-
# except IOError:
152-
# continue
153-
# else:
154-
# itemlist.append(row[columnindex])
155-
# if keep_headers:
156-
# current_out_writer.writerow(headers)
157-
# current_out_writer.writerow(row)
158-
# else:
159-
# current_out_writer = csv.writer(open(current_out_path, 'a'), delimiter=delimiter)
160-
# current_out_writer.writerow(row)
161-
# print 'totally %i unique items in column %i \n' % (len(itemlist),columnindex)
162-
# else:
163-
# print "oops, please check instruction of script by >>./csvsplit.py -h"
78+
sys.exit(1)
79+
80+
81+
def parse_file(arguments):
82+
"""
83+
Splits the CSV into multiple files or chunks based on the row_limit.
84+
Then create new CSV files.
85+
"""
86+
input_file = arguments[0]
87+
output_file = arguments[1]
88+
row_limit = arguments[2]
89+
output_path = '.' # Current directory
90+
91+
# Read CSV, split into list of lists
92+
with open(input_file, 'r') as input_csv:
93+
datareader = csv.reader(input_csv)
94+
all_rows = []
95+
for row in datareader:
96+
all_rows.append(row)
97+
98+
# Remove header
99+
header = all_rows.pop(0)
100+
101+
# Split list of list into chunks
102+
current_chunk = 0
103+
for i in range(0, len(all_rows), row_limit): # Loop through list
104+
chunk = all_rows[i:i + row_limit] # Create single chunk
105+
106+
current_output = os.path.join( # Create new output file
107+
output_path,
108+
"{}-{}.csv".format(output_file, current_chunk)
109+
)
110+
111+
# Add header
112+
chunk.insert(0, header)
113+
114+
# Write chunk to output file
115+
with open(current_output, 'w') as output_csv:
116+
writer = csv.writer(output_csv)
117+
writer = writer.writerows(chunk)
118+
119+
# Output info
120+
print ""
121+
print "Chunk # {}:".format(current_chunk)
122+
print "Filepath: {}".format(current_output)
123+
print "# of rows: {}".format(len(chunk))
124+
125+
# Create new chunk
126+
current_chunk += 1
164127

165128

166129
if __name__ == "__main__":
167-
main(sys.argv[1:])
130+
arguments = get_arguments()
131+
parse_file(arguments)

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy