added csv_split

mjhea0 · mjhea0 · commit 9828e28fc53f · 2014-05-30T09:24:44.000-05:00
diff --git a/12_csv_split.py b/12_csv_split.py
@@ -1,167 +1,131 @@
-### WIP
-
 import sys
 import os
-import getopt
 import csv
+import argparse
 
 """
-    Splits a CSV file into multiple pieces based on command line arguments.
+
+Splits a CSV file into multiple pieces based on command line arguments.
 
     Arguments:
-        `-h`: help file of usage of the script  
-        `-i`: input file name 
-        `-o`: output file, A %s-style template for the numbered output files.
-        `-r`: row limit to split 
-        `-c`: A %s-style template for the numbered output files.
+
+    `-h`: show the usage help for the script
+    `-i`: input file name (with extension)
+    `-o`: output file name (without extension)
+    `-r`: row limit to split at
 
     Default settings:
-        `output_path` is the current directory
-        `keep_headers` is on (headers will be kept)
-        `delimeter` is ,
+
+    `output_path` is the current directory
+    the header row is written to each split file
+    the default delimiter is a comma
 
     Example usage:
-        # split by every 10000 rows
-        >> python 12_csv_split.py -i input.csv -o rownumber -r 10000   
-        # split by unique items in column 0 
-        >> python 12_csv_split.py -i input.csv -o userid -c 0   
-        # access help
-        >> python 12_csv_split.py -h for help 
-    
+
+    ```
+    # split csv by every 100 rows
+    >> python 12_csv_split.py -i input.csv -o output -r 100
+    ```
+
 """
 
-def main(argv):
-
-    argument_dict = grab_command_line_arguments(argv)
-    parse_file(argument_dict)
-
-
-def grab_command_line_arguments(argv):
-
-    # global variables
-    inputfile = ''
-    outputfile = ''
-    rowlimit = ''
-    columnindex = ''  
-    argument_dict = {} 
-
-    # grab arguments
-    opts, args = getopt.getopt(argv,"hi:o:r:c:",["ifile=","ofile=","rowlimit=","columnindex="])
-
-    # end if no arguments provided
-    if not opts:
-        print "No options provided. Try again. Use `-h` for help."
-        sys.exit()
-
-    # grab arguments
-    for opt, arg in opts:
-        if opt == '-h':
-            print 'csvsplit.py -i <inputfile> -r <row limit> -c <column index> -o <outputfile>'
-            sys.exit()
-        elif opt in ("-i", "--ifile"):
-            inputfile = arg
-        elif opt in ("-o", "--ofile"):
-            outputfile = arg
-        elif opt in ("-r", "--rowlimit"):
-            rowlimit = arg
-        elif opt in ("-c", "--columnindex"):
-            columnindex = arg
-
-    # Output arguments
-    print "\nArguments:"
-    if inputfile:
-        argument_dict["input_file"] = inputfile
-        print "Input file is '{}'".format(inputfile)
-    else:
-        "Please enter an input file."
-    if outputfile:
-        argument_dict["output_file"] = outputfile
-        print "Output file is '{}'".format(outputfile)
-    else:
-        print "Please enter an output file."
-    if rowlimit:
-        argument_dict["rowlimit"] = rowlimit
-        print "Rowlimit is '{}'".format(rowlimit)
-    if columnindex:
-        argument_dict["columnindex"] = columnindex
-        print "Columnindex is '{}'".format(columnindex) 
-    if rowlimit and columnindex:
-        print "Please use either a rowlimit or columnlimit, not both."
-        sys.exit()
-    if not rowlimit or columnindex:
-        print "Please enter either a rowlimit or columnlimit."
-        sys.exit()
-
-    # to do - check to make sure file, rowlimit, and columnlimit exist
-    print argument_dict
-    return argument_dict
-
-
-def parse_file(argument_dict):
-
-    #split csv file by certain rownumber 
-    if argument_dict["rowlimit"]:           
-        rowlimit = int(argument_dict["rowlimit"])
-        output_name_file = "{}.csv".format(argument_dict["output_file"])
-        output_path='.'
-        keep_headers=True
-        delimiter=','
-        filehandler = open(argument_dict["input_file"],'r')
-        reader = csv.reader(filehandler, delimiter=delimiter)
-        current_piece = 1
-        current_out_path = os.path.join(
-            output_path,
-            output_name_file
+
+def get_arguments():
+    """
+    Parse and validate the command line arguments with argparse.
+
+    Returns a tuple of (input_file, output_file, row_limit). If an argument
+    is missing or fails validation, argparse prints a usage message and the
+    script exits.
+    """
+
+    # Use argparse to get command line arguments
+    parser = argparse.ArgumentParser()
+    parser.add_argument("-i", "--input_file", required=True,
+                        help="csv input file (with extension)", type=str)
+    parser.add_argument("-o", "--output_file", required=True,
+                        help="csv output file (without extension)", type=str)
+    parser.add_argument("-r", "--row_limit", required=True,
+                        help="row limit to split csv at", type=int)
+    args = parser.parse_args()
+
+    # A row limit below 1 cannot be used as a chunk size: range() in
+    # parse_file() raises ValueError for a step of 0 and yields nothing
+    # for a negative step, so reject it here with a clear message.
+    if args.row_limit < 1:
+        parser.error("The 'row_limit' must be a positive integer!")
+
+    # Check if the input_file exists
+    is_valid_file(parser, args.input_file)
+
+    # Check if the input_file has enough rows for the row_limit
+    is_valid_csv(parser, args.input_file, args.row_limit)
+
+    return args.input_file, args.output_file, args.row_limit
+
+
+def is_valid_file(parser, file_name):
+    """
+    Ensure that the input_file exists and is a regular file.
+
+    On failure the problem is reported through parser.error(), which prints
+    the usage message and exits the script with status 2, so no explicit
+    sys.exit() call is needed afterwards.
+    """
+    if not os.path.exists(file_name):
+        parser.error("The file '{}' does not exist!".format(file_name))
+    # A directory exists but cannot be opened as a CSV file
+    if not os.path.isfile(file_name):
+        parser.error("'{}' is not a file!".format(file_name))
+
+
+def is_valid_csv(parser, file_name, row_limit):
+    """
+    Ensure that the input_file has at least row_limit rows.
+
+    Every row returned by csv.reader is counted, so the header row is
+    included in the total. If row_limit is larger than that total, the
+    problem is reported through parser.error(), which prints the usage
+    message and exits the script with status 2.
+    """
+    # Open the file in a with-block so the handle is closed after counting
+    with open(file_name) as input_csv:
+        row_count = sum(1 for row in csv.reader(input_csv))
+    if row_limit > row_count:
+        parser.error(
+            "The 'row_limit' of '{}' is > the number of rows in '{}'!"
+            .format(row_limit, file_name)
         )
-        current_out_writer = csv.writer(open(current_out_path, 'w'), delimiter=delimiter)
-        current_limit = rowlimit
-        if keep_headers:
-            headers = reader.next()
-            current_out_writer.writerow(headers)
-        for i, row in enumerate(reader):
-            if i + 1 > current_limit:
-                current_piece += 1
-                current_limit = rowlimit * current_piece
-                current_out_path = os.path.join(
-                    output_path,
-                    output_name_file
-                )
-            current_out_writer = csv.writer(open(current_out_path, 'w'), delimiter=delimiter)
-
-# elif columnindex:               #split csv file accrording to unique values of certain column,it's like filter only certain item in excel 
-# itemlist = []
-# columnindex = int(columnindex)
-# output_name_template= outputfile+'_%s.csv'
-# output_path='.'
-# keep_headers=True
-# delimiter=','
-# filehandler = open(inputfile,'r')
-# reader = csv.reader(filehandler, delimiter=delimiter)
-# if keep_headers:
-#   headers = reader.next()
-
-# for i, row in enumerate(reader):
-
-#   current_out_path = os.path.join(
-#        output_path,
-#        output_name_template  % row[columnindex] )
-#   if row[columnindex] not in itemlist:
-#      try:
-#          current_out_writer = csv.writer(open(current_out_path, 'w'), delimiter=delimiter)
-#      except IOError:
-#          continue
-#      else:
-#          itemlist.append(row[columnindex])
-#          if keep_headers:
-#              current_out_writer.writerow(headers)
-#          current_out_writer.writerow(row)
-#   else:
-#      current_out_writer = csv.writer(open(current_out_path, 'a'), delimiter=delimiter)
-#      current_out_writer.writerow(row)
-# print 'totally %i unique items in column %i \n' % (len(itemlist),columnindex)
-# else:
-# print "oops, please check instruction of script by >>./csvsplit.py -h"
+
+
+def parse_file(arguments):
+    """
+    Split the input CSV into numbered chunk files of at most row_limit rows.
+
+    `arguments` is the (input_file, output_file, row_limit) sequence returned
+    by get_arguments(). The first row of the input is treated as the header
+    and is written at the top of every chunk. Chunks are written to the
+    current directory as "<output_file>-<n>.csv", with n starting at 0, and
+    a short summary of each chunk is printed. An empty input file produces
+    no output files.
+    """
+    input_file = arguments[0]
+    output_file = arguments[1]
+    row_limit = arguments[2]
+    output_path = '.'  # Current directory
+
+    # Read the whole CSV into memory; the input file is closed again before
+    # any output file is written
+    with open(input_file, 'r') as input_csv:
+        all_rows = list(csv.reader(input_csv))
+
+    # An empty input file has no header and nothing to split
+    if not all_rows:
+        return
+
+    # Separate the header from the data rows
+    header = all_rows[0]
+    data_rows = all_rows[1:]
+
+    # Walk through the data rows one row_limit-sized slice at a time
+    chunk_starts = range(0, len(data_rows), row_limit)
+    for current_chunk, start in enumerate(chunk_starts):
+        # Every chunk starts with the header
+        chunk = [header] + data_rows[start:start + row_limit]
+
+        current_output = os.path.join(  # Create new output file
+            output_path,
+            "{}-{}.csv".format(output_file, current_chunk)
+        )
+
+        # Write chunk to output file
+        with open(current_output, 'w') as output_csv:
+            csv.writer(output_csv).writerows(chunk)
+
+        # Output info; single-argument print() calls behave the same way
+        # under Python 2 and Python 3. The row count includes the header.
+        print("")
+        print("Chunk # {}:".format(current_chunk))
+        print("Filepath: {}".format(current_output))
+        print("# of rows: {}".format(len(chunk)))
 
 
 if __name__ == "__main__":
-   main(sys.argv[1:])
+    # Collect and validate the command line arguments, then split the file
+    parse_file(get_arguments())