1、读取excel:使用xlrd库

1
2
3
4
5
6
7
8
9
10
11
from urllib.parse import urlparse
import xlrd
file = "./网址.xls"
workbook = xlrd.open_workbook(filename=file)
table = workbook.sheet_by_name("Sheet1")
domain_set = set()
for i in range(table.nrows):
url = table.row_values(i)[0]
domain_no_http_www = urlparse(url).netloc.replace("www.", "")
domain_set.add(domain_no_http_www)
print(domain_set, len(domain_set))

注:如果出现 xlrd.biffh.XLRDError: Excel xlsx file; not supported,这是由于当前python中的xlrd版本过高导致的,需要降低版本

1
pip3 install xlrd==1.2.0

2、写入excel库:使用xlwt库

1
2
3
4
5
6
7
8
9
10
import xlwt
book = xlwt.Workbook()
sheet1 = book.add_sheet("Sheet1")
row = 0
for domain in domain_set:
if domain not in db_set:
print(domain)
sheet1.write(row, 0, domain)
row += 1
book.save("./库中没有的domain.xls")

3、写入excel:使用XlsxWriter(只能写入.xlsx)

1
2
3
4
5
6
7
8
9
10
11
12
import xlsxwriter
header = ["title","url"]
data = [["百度","baidu.com"],["搜狐","souhu.com"]]
workbook = xlsxwriter.Workbook("./test.xlsx")
sheet1 = workbook.add_worksheet("sheet1")
sheet1.activate()
sheet1.write_row("A1", header) # 写入行只能A1,A2...作为行
row = 2
for per_data_list in data:
sheet1.write_row(f"A{row}", per_data_list)
row += 1
workbook.close()

4、合并单元格

1
2
3
4
5
6
7
8
9
10
11
import pandas as pd
from openpyxl import load_workbook
df = pd.read_excel("./湖南监管网从业人员.xlsx", sheet_name=0)
wb = load_workbook('./湖南监管网从业人员.xlsx')
ws = wb[wb.sheetnames[0]]
company_dict = df.groupby("公司名称").groups
for company, rows_list in company_dict.items():
print(company, rows_list, rows_list[0], rows_list[-1])
ws.merge_cells(start_row=rows_list[0]+2, end_row=rows_list[-1]+2, start_column=1, end_column=1)
ws.merge_cells(start_row=rows_list[0]+2, end_row=rows_list[-1]+2, start_column=2, end_column=2)
wb.save('./湖南监管网从业人员2.xlsx')

5、追加写入多个sheet(不会删除原有的sheet)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
import os
from openpyxl import Workbook, load_workbook
if not os.path.exists("./test.xlsx"):
wb_w = Workbook()
wb_w.save("./test.xlsx")

wb = load_workbook("./test.xlsx")
ws = wb.create_sheet('这是表1')
ws.append(["姓名", "年龄", "性别"])
datas = [["张三", 93, "男"], ["小丽", 15, "女"], ["吴日新", 45, "男"]]
for row in datas:
ws.append(row)

ws = wb.create_sheet('这是表2222')
ws.append(["姓名", "年龄", "性别"])
datas = [["张三", 93, "男"], ["小丽", 15, "女"], ["吴日新", 45, "男"]]
for row in datas:
ws.append(row)

try: # 表默认带一个sheet,如果空表没有sheet会打不开,但是自己新建了sheet,这个就可以删除了
del wb["Sheet"]
except:
pass

wb.save('./test.xlsx')