Geek squad: Python Code

---------------------- V9 Code in kaffle to run mixed case

Ver0

import pandas as pd
import re,pprint
# read the file and parse the first and only sheet (need python xlrd module)
f = pd.ExcelFile('../input/onesheet-dataset/Dimensions_Source_003.xlsx')
#all = f.parse(parse_cols=[2,3,4,5,6,7], skiprows=0, header=1)
#print(all)
all = f.parse("Bistrack")
REG = r"(.+?)([A-Z])"
def snake(match):
return match.group(1).lower() + "_" + match.group(2).lower()
#df = pd.read_csv('../input/Accounts_Payable_csv.csv') # load data from csv
words = all['Column Physical_Name']
results = [re.sub(REG, snake, w, 0) for w in words]
print("\n".join(results))
print('Writing results...')
resultFile = open('test.csv', 'w')
resultFile.write(pprint.pformat(results))
resultFile.close()
print('Done.')

Ver1

import pandas as pd

import re,pprint

# read the file and parse the first and only sheet (need python xlrd module)

f = pd.ExcelFile('../input/onesheet-dataset/Dimensions_Source_003.xlsx')

#all = f.parse(parse_cols=[2,3,4,5,6,7], skiprows=0, header=1)

#print(all)

all = f.parse("Bistrack")

REG = r"(.+?)([A-Z])"

def snake(match):

return match.group(1).lower() + "_" + match.group(2).lower()

for i, row in all.iterrows():

print(row['SubjectArea'], row['Source'], row['@Table Name'], re.sub(REG, snake, row['Column Physical_Name'], 0), row['Physical Data Type'], row['Null Option'])

Ver3

import numpy as np # linear algebra

import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import re,pprint

# read the file and parse the first and only sheet (need python xlrd module)

f = pd.ExcelFile('../input/onesheet-dataset/Dimensions_Source_003.xlsx')

#all = f.parse(parse_cols=[2,3,4,5,6,7], skiprows=0, header=1)

#print(all)

all = f.parse("Bistrack")

REG = r"(.+?)([A-Z])"

def snake(match):

return match.group(1).lower() + "_" + match.group(2).lower()

for i, row in all.iterrows():

ind= tuple([row['SubjectArea'], row['Source'], row['@Table Name'], re.sub(REG, snake, row['Column Physical_Name'], 0), row['Physical Data Type'], row['Null Option']])

print(ind)

Ver4

f = pd.ExcelFile('../input/onesheet-dataset/Dimensions_Source_003.xlsx')

#all = f.parse(parse_cols=[2,3,4,5,6,7], skiprows=0, header=1)

#print(all)

all = f.parse("Bistrack")

REG = r"(.+?)([A-Z])"

def snake(match):

return match.group(1).lower() + "_" + match.group(2).lower()

print('Writing results...')

writer = pd.ExcelWriter('pandas_simple.xlsx', engine='xlsxwriter')

#resultFile = open('test_all.csv', 'w')

for i, row in all.iterrows():

df = pd.DataFrame({row['SubjectArea'], row['Source'], row['@Table Name'], re.sub(REG, snake, row['Column Physical_Name'], 0), row['Physical Data Type'], row['Null Option']})

# results= tuple([row['SubjectArea'], row['Source'], row['@Table Name'], re.sub(REG, snake, row['Column Physical_Name'], 0), row['Physical Data Type'], row['Null Option']])

# resultFile.write(pprint.pformat(results))

df.to_excel(writer, sheet_name='Sheet1')

writer.save()

#resultFile.close()

print('Done.')

#print(row['SubjectArea'], row['Source'], row['@Table Name'], re.sub(REG, snake, row['Column Physical_Name'], 0), row['Physical Data Type'], row['Null Option'])

------------- Final Version

import numpy as np # linear algebra

import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import re,pprint

# read the file and parse the first and only sheet (need python xlrd module)

f = pd.ExcelFile('../input/new-version/Dimensions_Source_004.xlsx')

#all = f.parse(parse_cols=[2,3,4,5,6,7], skiprows=0, header=1)

#print(all)

all = f.parse("Bistrack")

REG = r"(.+?)([A-Z])"

def snake(match):

return match.group(1).lower() + "_" + match.group(2).lower()

resultn =[]

print('Writing results...')

#resultFile = open('test_all.csv', 'w')

for i, row in all.iterrows():

#df = pd.DataFrame({row['SubjectArea'], row['Source'], row['@Table Name'], re.sub(REG, snake, row['Column Physical_Name'], 0), row['Physical Data Type'], row['Null Option']})

results = tuple([row['SubjectArea'], row['Source'], row['@Table Name'], re.sub(REG, snake, row['Column Physical_Name'], 0), row['Physical Data Type'], row['Null Option']])

resultn.append(results)

#df = pd.DataFrame({'Data': results})

#resultFile.write(pprint.pformat(results))

df = pd.DataFrame({'Data': resultn})

writer = pd.ExcelWriter('pandas_simple.xlsx', engine='xlsxwriter')

df.to_excel(writer, sheet_name='Sheet1')

writer.save()

#resultFile.close()

print('Done.')

Manual work in spreadsheet after it is prepared from the script above

Remove first row

remove first column

Replace all " ' "

Replace " ( " in first column

Replace " ) " in Last Column

---Better Function

https://stackoverflow.com/questions/1175208/elegant-python-function-to-convert-camelcase-to-snake-case

def convert(name):
s1 = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', name)
return re.sub('([a-z0-9])([A-Z])', r'\1_\2', s1).lower()

There's an inflection library in the package index that can handle these things for you. In this case, you'd be looking for inflection.underscore():

>>> inflection.underscore('CamelCase')

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import re,pprint
# read the file and parse the first and only sheet (need python xlrd module)
f = pd.ExcelFile('../input/testcc2/Dimensions_Source_004.xlsx')
#all = f.parse(parse_cols=[2,3,4,5,6,7], skiprows=0, header=1)
#print(all)
all = f.parse("Bistrack")
def convert(name):
s1 = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', name)
return re.sub('([a-z0-9])([A-Z])', r'\1_\2', s1).lower()

REG = r"(.+?)([A-Z])"
def snake(match):
    return match.group(1).lower() + " " + match.group(2).lower()
resultn =[]
resultnConvert=[]
print('Writing results...')
#resultFile = open('test_all.csv', 'w')
for i, row in all.iterrows():
     #results = tuple([row['SubjectArea'], row['Source'], row['@Table Name'], re.sub(REG, snake, row['Column Physical_Name'], 0), row['Physical Data Type'], row['Null Option']])
     resultc = tuple([row['SubjectArea'], row['Source'], row['@Table Name'], convert(row['Column Physical_Name']) , row['Physical Data Type'], row['Null Option']])
     resultnConvert.append(resultc)
df = pd.DataFrame({'Data': resultnConvert})
writer = pd.ExcelWriter('pandas_convert.xlsx', engine='xlsxwriter')
df.to_excel(writer, sheet_name='Sheet1', header=False, index=False)
writer.save()
#resultFile.close()
print('Done.')

Geek squad

Wednesday, October 2, 2019

Python Code - onenote lbm

No comments:

Post a Comment