
需求:有 2 个 list(alist、blist),alist 中的每个值都要与 blist 中的每个值做字符串相似度计算,两个 list 的数量级均为 20 万。下面为 Python 和 Go 的代码片段。Python:
# Compute the similarity between two strings.
def similar(a, b):
    """Return the ``fuzz.token_set_ratio`` score of ``a`` and ``b`` divided by 100.

    Returns 0 if either argument is None. Each comparison is also printed.
    """
    if a is None or b is None:
        return 0
    # NOTE(review): only ``b`` is lower-cased before the comparison; confirm
    # whether ``a`` is expected to be lower-case already.
    similarity = fuzz.token_set_ratio(a, b.lower()) / 100
    print("similar a:", a, ",", "b:", b, ", similarity:", similarity)
    return similarity


def compute_similarity(args):
    """Unpack a ``(record_name, name)`` pair and return ``(similarity, name)``.

    Takes a single tuple so it can be passed directly to ``executor.map``.
    """
    record_name, name = args
    return similar(record_name, name), name


# Update the database records.
def update_database(cursor, name_mapping, csv_data):
    """Assign vendors from CSV rows to the UUIDs whose names match.

    Args:
        cursor: object exposing ``executemany(sql, rows)``.
        name_mapping: mapping of UUID -> iterable of names; ``None`` or empty
            names are skipped.
        csv_data: iterable of rows supporting ``row.get("vendor")`` and
            ``row.get("name")``.

    Returns:
        The number of ``(vendor, uuid)`` pairs passed to ``executemany``
        (0 when nothing matched and no statement was executed).
    """
    update_sql = "UPDATE tweb_fingerprint_test SET factory = %s WHERE uuid = %s"

    # Build a reverse mapping so UUIDs can be looked up quickly by name.
    name_to_uuid = defaultdict(list)
    for uuid, names in name_mapping.items():
        for name in names:
            if name:  # skip None or empty names
                name_to_uuid[name].append(uuid)

    updates = []
    with ProcessPoolExecutor() as executor:
        for row in csv_data:
            vendor_name = row.get("vendor")
            record_name = row.get("name")
            print("record_name:", record_name)
            if (
                vendor_name is None
                or record_name is None
                or vendor_name in ["未知", "None"]
            ):
                continue  # skip this row

            # Exact name lookup first. Copy the result so the list stored in
            # name_to_uuid is never aliased or mutated.
            uuids_to_update = list(name_to_uuid.get(record_name, ()))

            # No exact match: fall back to names whose similarity exceeds 98%.
            if not uuids_to_update:
                tasks = [(record_name, name) for name in name_to_uuid]
                # Batch the tasks; the default chunksize of 1 ships every pair
                # to a worker process individually.
                results = executor.map(compute_similarity, tasks, chunksize=1024)
                for similarity, name in results:
                    if similarity > 0.98:
                        # Each map entry is a list of UUIDs; extend with its
                        # items so every update row carries a single UUID.
                        uuids_to_update.extend(name_to_uuid[name])

            # Queue one update per matching UUID.
            for uuid_to_update in uuids_to_update:
                updates.append((vendor_name, uuid_to_update))

    # Batch update.
    if updates:
        cursor.executemany(update_sql, updates)

    # Return the number of updated records.
    return len(updates)


# go:
// Similar calculates the similarity between two strings func Similar(a, b string) float64 { return smetrics.JaroWinkler(a, b, 0.7, 4) } // UpdateDatabase updates the database with the new vendor information // UpdateDatabase updates the database with the new vendor information func UpdateDatabase(db *sql.DB, vendors map[string]Vendor, records []CSVRecord) (int, error) { fmt.Println("records", len(records)) fmt.Println("vendors", len(vendors)) stmt, err := db.Prepare("UPDATE tweb_fingerprint SET factory = ? WHERE uuid = ?") if err != nil { return 0, err } defer stmt.Close() var wg sync.WaitGroup updates := make(chan Updatedata, len(records)) for _, record := range records { wg.Add(1) go func(record CSVRecord) { defer wg.Done() // fmt.Println(record.Name) for _, vendor := range vendors { if record.Name == vendor.Name.String || Similar(record.Name, vendor.Name.String) > SimilarityThreshold { updates <- Updatedata{ UUID: vendor.UUI, Factory: record.Vendor, } } } }(record) } go func() { wg.Wait() close(updates) }() count := 0 for update := range updates { fmt.Println("update:", update) if _, err := stmt.Exec(update.Factory, update.UUID); err != nil { return count, err } count++ } return count, nil } 1 Baloneo 2023 年 12 月 13 日 快多少? |