----------------------
V9 Code in kaffle to run mixed case
Ver0
import pandas as pd
import re,pprint
# read the file and parse the first and only sheet (need python xlrd module)
f = pd.ExcelFile('../input/onesheet-dataset/Dimensions_Source_003.xlsx')
#all = f.parse(parse_cols=[2,3,4,5,6,7], skiprows=0, header=1)
#print(all)
all = f.parse("Bistrack")
REG = r"(.+?)([A-Z])"
def snake(match):
return match.group(1).lower() + "_" + match.group(2).lower()
#df = pd.read_csv('../input/Accounts_Payable_csv.csv') # load data from csv
words = all['Column Physical_Name']
results = [re.sub(REG, snake, w, 0) for w in words]
print("\n".join(results))
print('Writing results...')
resultFile = open('test.csv', 'w')
resultFile.write(pprint.pformat(results))
resultFile.close()
print('Done.')
import re,pprint
# read the file and parse the first and only sheet (need python xlrd module)
f = pd.ExcelFile('../input/onesheet-dataset/Dimensions_Source_003.xlsx')
#all = f.parse(parse_cols=[2,3,4,5,6,7], skiprows=0, header=1)
#print(all)
all = f.parse("Bistrack")
REG = r"(.+?)([A-Z])"
def snake(match):
return match.group(1).lower() + "_" + match.group(2).lower()
#df = pd.read_csv('../input/Accounts_Payable_csv.csv') # load data from csv
words = all['Column Physical_Name']
results = [re.sub(REG, snake, w, 0) for w in words]
print("\n".join(results))
print('Writing results...')
resultFile = open('test.csv', 'w')
resultFile.write(pprint.pformat(results))
resultFile.close()
print('Done.')
Ver1
import pandas as pd
import re,pprint
# read the file and
parse the first and only sheet (need python xlrd module)
f =
pd.ExcelFile('../input/onesheet-dataset/Dimensions_Source_003.xlsx')
#all =
f.parse(parse_cols=[2,3,4,5,6,7], skiprows=0, header=1)
#print(all)
all =
f.parse("Bistrack")
REG =
r"(.+?)([A-Z])"
def snake(match):
return match.group(1).lower() +
"_" + match.group(2).lower()
for i, row in
all.iterrows():
print(row['SubjectArea'], row['Source'],
row['@Table Name'], re.sub(REG, snake, row['Column Physical_Name'], 0),
row['Physical Data Type'], row['Null Option'])
Ver3
import numpy as np #
linear algebra
import pandas as pd
# data processing, CSV file I/O (e.g. pd.read_csv)
import re,pprint
# read the file and
parse the first and only sheet (need python xlrd module)
f =
pd.ExcelFile('../input/onesheet-dataset/Dimensions_Source_003.xlsx')
#all =
f.parse(parse_cols=[2,3,4,5,6,7], skiprows=0, header=1)
#print(all)
all =
f.parse("Bistrack")
REG =
r"(.+?)([A-Z])"
def snake(match):
return match.group(1).lower() +
"_" + match.group(2).lower()
for i, row in
all.iterrows():
ind= tuple([row['SubjectArea'],
row['Source'], row['@Table Name'], re.sub(REG, snake, row['Column
Physical_Name'], 0), row['Physical Data Type'], row['Null Option']])
print(ind)
Ver4
f =
pd.ExcelFile('../input/onesheet-dataset/Dimensions_Source_003.xlsx')
#all =
f.parse(parse_cols=[2,3,4,5,6,7], skiprows=0, header=1)
#print(all)
all =
f.parse("Bistrack")
REG =
r"(.+?)([A-Z])"
def snake(match):
return match.group(1).lower() +
"_" + match.group(2).lower()
print('Writing
results...')
writer =
pd.ExcelWriter('pandas_simple.xlsx', engine='xlsxwriter')
#resultFile =
open('test_all.csv', 'w')
for i, row in
all.iterrows():
df = pd.DataFrame({row['SubjectArea'],
row['Source'], row['@Table Name'], re.sub(REG, snake, row['Column
Physical_Name'], 0), row['Physical Data Type'], row['Null Option']})
# results= tuple([row['SubjectArea'],
row['Source'], row['@Table Name'], re.sub(REG, snake, row['Column
Physical_Name'], 0), row['Physical Data Type'], row['Null Option']])
#
resultFile.write(pprint.pformat(results))
df.to_excel(writer, sheet_name='Sheet1')
writer.save()
#resultFile.close()
print('Done.')
#print(row['SubjectArea'],
row['Source'], row['@Table Name'], re.sub(REG, snake, row['Column
Physical_Name'], 0), row['Physical Data Type'], row['Null Option'])
------------- Final
Version
import numpy as np # linear
algebra
import pandas as pd # data
processing, CSV file I/O (e.g. pd.read_csv)
import re,pprint
# read the file and parse the
first and only sheet (need python xlrd module)
f =
pd.ExcelFile('../input/new-version/Dimensions_Source_004.xlsx')
#all =
f.parse(parse_cols=[2,3,4,5,6,7], skiprows=0, header=1)
#print(all)
all =
f.parse("Bistrack")
REG = r"(.+?)([A-Z])"
def snake(match):
return match.group(1).lower() +
"_" + match.group(2).lower()
resultn =[]
print('Writing results...')
#resultFile =
open('test_all.csv', 'w')
for i, row in all.iterrows():
#df = pd.DataFrame({row['SubjectArea'],
row['Source'], row['@Table Name'], re.sub(REG, snake, row['Column
Physical_Name'], 0), row['Physical Data Type'], row['Null Option']})
results = tuple([row['SubjectArea'],
row['Source'], row['@Table Name'], re.sub(REG, snake, row['Column
Physical_Name'], 0), row['Physical Data Type'], row['Null Option']])
resultn.append(results)
#df = pd.DataFrame({'Data': results})
#resultFile.write(pprint.pformat(results))
df = pd.DataFrame({'Data':
resultn})
writer =
pd.ExcelWriter('pandas_simple.xlsx', engine='xlsxwriter')
df.to_excel(writer,
sheet_name='Sheet1')
writer.save()
#resultFile.close()
print('Done.')
Manual work in
spreadsheet after it is prepared from the script above
Remove first row
remove first column
Replace all " '
"
Replace " (
" in first column
Replace " )
" in Last Column
---Better Function
def convert(name):
s1 = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', name)
return re.sub('([a-z0-9])([A-Z])', r'\1_\2', s1).lower()
s1 = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', name)
return re.sub('([a-z0-9])([A-Z])', r'\1_\2', s1).lower()
There's an inflection library in
the package index that can handle these things for you. In this case, you'd be
looking for inflection.underscore():
>>> inflection.underscore('CamelCase')
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import re,pprint
# read the file and parse the first and only sheet (need python xlrd module)
f = pd.ExcelFile('../input/testcc2/Dimensions_Source_004.xlsx')
#all = f.parse(parse_cols=[2,3,4,5,6,7], skiprows=0, header=1)
#print(all)
all = f.parse("Bistrack")
def convert(name):
s1 = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', name)
return re.sub('([a-z0-9])([A-Z])', r'\1_\2', s1).lower()
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import re,pprint
# read the file and parse the first and only sheet (need python xlrd module)
f = pd.ExcelFile('../input/testcc2/Dimensions_Source_004.xlsx')
#all = f.parse(parse_cols=[2,3,4,5,6,7], skiprows=0, header=1)
#print(all)
all = f.parse("Bistrack")
def convert(name):
s1 = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', name)
return re.sub('([a-z0-9])([A-Z])', r'\1_\2', s1).lower()
REG = r"(.+?)([A-Z])"
def snake(match):
return match.group(1).lower() + " " + match.group(2).lower()
resultn =[]
resultnConvert=[]
print('Writing results...')
#resultFile = open('test_all.csv', 'w')
for i, row in all.iterrows():
#results = tuple([row['SubjectArea'], row['Source'], row['@Table Name'], re.sub(REG, snake, row['Column Physical_Name'], 0), row['Physical Data Type'], row['Null Option']])
resultc = tuple([row['SubjectArea'], row['Source'], row['@Table Name'], convert(row['Column Physical_Name']) , row['Physical Data Type'], row['Null Option']])
resultnConvert.append(resultc)
df = pd.DataFrame({'Data': resultnConvert})
writer = pd.ExcelWriter('pandas_convert.xlsx', engine='xlsxwriter')
df.to_excel(writer, sheet_name='Sheet1', header=False, index=False)
writer.save()
#resultFile.close()
print('Done.')
def snake(match):
return match.group(1).lower() + " " + match.group(2).lower()
resultn =[]
resultnConvert=[]
print('Writing results...')
#resultFile = open('test_all.csv', 'w')
for i, row in all.iterrows():
#results = tuple([row['SubjectArea'], row['Source'], row['@Table Name'], re.sub(REG, snake, row['Column Physical_Name'], 0), row['Physical Data Type'], row['Null Option']])
resultc = tuple([row['SubjectArea'], row['Source'], row['@Table Name'], convert(row['Column Physical_Name']) , row['Physical Data Type'], row['Null Option']])
resultnConvert.append(resultc)
df = pd.DataFrame({'Data': resultnConvert})
writer = pd.ExcelWriter('pandas_convert.xlsx', engine='xlsxwriter')
df.to_excel(writer, sheet_name='Sheet1', header=False, index=False)
writer.save()
#resultFile.close()
print('Done.')
No comments:
Post a Comment