Wednesday, October 2, 2019

Python Code - onenote lbm


---------------------- V9 Code in kaffle to run mixed case

Ver0
import pandas as pd
import re,pprint
# read the file and parse the first and only sheet (need python xlrd module)
f
= pd.ExcelFile('../input/onesheet-dataset/Dimensions_Source_003.xlsx')
#all = f.parse(parse_cols=[2,3,4,5,6,7], skiprows=0, header=1)
#print(all)
all = f.parse("Bistrack")
REG
= r"(.+?)([A-Z])"
def snake(match):
   
return match.group(1).lower() + "_" + match.group(2).lower()
#df = pd.read_csv('../input/Accounts_Payable_csv.csv') # load data from csv
words
= all['Column Physical_Name']
results
= [re.sub(REG, snake, w, 0) for w in words]
print("\n".join(results))
print('Writing results...')
resultFile
= open('test.csv', 'w')
resultFile
.write(pprint.pformat(results))
resultFile
.close()
print('Done.')

Ver1
import pandas as pd
import re,pprint
# read the file and parse the first and only sheet (need python xlrd module)
f = pd.ExcelFile('../input/onesheet-dataset/Dimensions_Source_003.xlsx')
#all = f.parse(parse_cols=[2,3,4,5,6,7], skiprows=0, header=1)
#print(all)
all = f.parse("Bistrack")
REG = r"(.+?)([A-Z])"
def snake(match):
    return match.group(1).lower() + "_" + match.group(2).lower()
for i, row in all.iterrows():
 print(row['SubjectArea'], row['Source'], row['@Table Name'], re.sub(REG, snake, row['Column Physical_Name'], 0), row['Physical Data Type'], row['Null Option'])

Ver3
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import re,pprint
# read the file and parse the first and only sheet (need python xlrd module)
f = pd.ExcelFile('../input/onesheet-dataset/Dimensions_Source_003.xlsx')
#all = f.parse(parse_cols=[2,3,4,5,6,7], skiprows=0, header=1)
#print(all)
all = f.parse("Bistrack")
REG = r"(.+?)([A-Z])"
def snake(match):
    return match.group(1).lower() + "_" + match.group(2).lower()
for i, row in all.iterrows():
     ind= tuple([row['SubjectArea'], row['Source'], row['@Table Name'], re.sub(REG, snake, row['Column Physical_Name'], 0), row['Physical Data Type'], row['Null Option']])
     print(ind)

Ver4
f = pd.ExcelFile('../input/onesheet-dataset/Dimensions_Source_003.xlsx')
#all = f.parse(parse_cols=[2,3,4,5,6,7], skiprows=0, header=1)
#print(all)
all = f.parse("Bistrack")
REG = r"(.+?)([A-Z])"
def snake(match):
    return match.group(1).lower() + "_" + match.group(2).lower()
print('Writing results...')
writer = pd.ExcelWriter('pandas_simple.xlsx', engine='xlsxwriter')
#resultFile = open('test_all.csv', 'w')
for i, row in all.iterrows():
      df = pd.DataFrame({row['SubjectArea'], row['Source'], row['@Table Name'], re.sub(REG, snake, row['Column Physical_Name'], 0), row['Physical Data Type'], row['Null Option']})
     # results= tuple([row['SubjectArea'], row['Source'], row['@Table Name'], re.sub(REG, snake, row['Column Physical_Name'], 0), row['Physical Data Type'], row['Null Option']])
     # resultFile.write(pprint.pformat(results))
      df.to_excel(writer, sheet_name='Sheet1')
writer.save()
#resultFile.close()
print('Done.')
#print(row['SubjectArea'], row['Source'], row['@Table Name'], re.sub(REG, snake, row['Column Physical_Name'], 0), row['Physical Data Type'], row['Null Option'])


------------- Final Version


import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import re,pprint
# read the file and parse the first and only sheet (need python xlrd module)
f = pd.ExcelFile('../input/new-version/Dimensions_Source_004.xlsx')
#all = f.parse(parse_cols=[2,3,4,5,6,7], skiprows=0, header=1)
#print(all)
all = f.parse("Bistrack")
REG = r"(.+?)([A-Z])"
def snake(match):
    return match.group(1).lower() + "_" + match.group(2).lower()
resultn =[]
print('Writing results...')
#resultFile = open('test_all.csv', 'w')
for i, row in all.iterrows():
     #df = pd.DataFrame({row['SubjectArea'], row['Source'], row['@Table Name'], re.sub(REG, snake, row['Column Physical_Name'], 0), row['Physical Data Type'], row['Null Option']})
     results = tuple([row['SubjectArea'], row['Source'], row['@Table Name'], re.sub(REG, snake, row['Column Physical_Name'], 0), row['Physical Data Type'], row['Null Option']])
     resultn.append(results)
     #df = pd.DataFrame({'Data': results})
     #resultFile.write(pprint.pformat(results))
df = pd.DataFrame({'Data': resultn})
writer = pd.ExcelWriter('pandas_simple.xlsx', engine='xlsxwriter')
df.to_excel(writer, sheet_name='Sheet1')
writer.save()
#resultFile.close()
print('Done.')

Manual work in spreadsheet after it is prepared from the script above

Remove first row
remove first column
Replace all " ' "
Replace " ( " in first column
Replace " ) " in Last Column

---Better Function

def convert(name):
    s1 = re.sub(
'(.)([A-Z][a-z]+)', r'\1_\2', name)
   
return re.sub('([a-z0-9])([A-Z])', r'\1_\2', s1).lower()



There's an inflection library in the package index that can handle these things for you. In this case, you'd be looking for inflection.underscore():
>>> inflection.underscore('CamelCase')
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import re,pprint
# read the file and parse the first and only sheet (need python xlrd module)
f
= pd.ExcelFile('../input/testcc2/Dimensions_Source_004.xlsx')
#all = f.parse(parse_cols=[2,3,4,5,6,7], skiprows=0, header=1)
#print(all)
all = f.parse("Bistrack")
def convert(name):
    s1
= re.sub('(.)([A-Z][a-z]+)', r'\1_\2', name)
   
return re.sub('([a-z0-9])([A-Z])', r'\1_\2', s1).lower()
REG = r"(.+?)([A-Z])"
def snake(match):
   
return match.group(1).lower() + " " + match.group(2).lower()
resultn
=[]
resultnConvert
=[]
print('Writing results...')
#resultFile = open('test_all.csv', 'w')
for i, row in all.iterrows():   
    
#results = tuple([row['SubjectArea'], row['Source'], row['@Table Name'], re.sub(REG, snake, row['Column Physical_Name'], 0), row['Physical Data Type'], row['Null Option']])
     resultc
= tuple([row['SubjectArea'], row['Source'], row['@Table Name'], convert(row['Column Physical_Name']) , row['Physical Data Type'], row['Null Option']])
     resultnConvert
.append(resultc)         
df
= pd.DataFrame({'Data': resultnConvert})
writer
= pd.ExcelWriter('pandas_convert.xlsx', engine='xlsxwriter')
df
.to_excel(writer, sheet_name='Sheet1', header=False, index=False)
writer
.save()
#resultFile.close()
print('Done.')

No comments:

Post a Comment