|
2 | 2 | import json |
3 | 3 | import random |
4 | 4 | from datetime import date |
| 5 | +from difflib import SequenceMatcher |
5 | 6 | from pathlib import Path |
6 | 7 |
|
7 | 8 | import folium |
@@ -1277,6 +1278,36 @@ def __get__percentage_of_metadata_version_1(df): |
1277 | 1278 | return len(df[df["metadata_version"] == 1]) / len(df) |
1278 | 1279 |
|
1279 | 1280 |
|
def __get_similar_columns(df, column, threshold=0.85):
    """
    Return pairs of distinct values in ``column`` that look like near-duplicates.

    Null values are dropped, then every unordered pair of unique values is
    compared case-insensitively with ``difflib.SequenceMatcher``. A pair is
    reported when its similarity ratio is strictly greater than ``threshold``.

    Input parameters:
        df: DataFrame containing ``column``
        column: name of the column whose values are compared
        threshold: minimum (exclusive) similarity ratio, between 0 and 1

    Output: list of ``[value, compare_value, similarity]`` entries, ordered by
    first appearance of the values in the column. For example, an
    "affiliation" column might produce

        [['University of California, Los Angeles',
          'University of California, Los Angeles (UCLA)',
          0.9135802469135802]]
    """
    from itertools import combinations

    df = df.dropna(subset=[column])  # drop null values
    unique_values = df[column].unique()

    # Lower-case each value once up front; str() keeps non-string cells
    # (numbers, dates, ...) from raising on .lower().
    normalized = [(value, str(value).lower()) for value in unique_values]

    similar_pairs = []
    # combinations() visits each unordered pair exactly once, in column order,
    # so no bookkeeping of already-compared pairs is needed.
    for (value, value_key), (compare_value, compare_key) in combinations(
        normalized, 2
    ):
        similarity = SequenceMatcher(None, value_key, compare_key).ratio()
        if similarity > threshold:
            similar_pairs.append([value, compare_value, similarity])

    return similar_pairs
| 1309 | + |
| 1310 | + |
1280 | 1311 | def __get__percentage_of_metadata_version_2(df): |
1281 | 1312 | """ |
1282 | 1313 | Calculates the percentage of rows in the DataFrame that have 'metadata_version' equal to 2. |
@@ -1353,27 +1384,29 @@ def report(): |
1353 | 1384 |
|
1354 | 1385 | return report |
1355 | 1386 |
|
| 1387 | + |
def create_tree_map(frequency_dict, width, height):
    """
    Build a "Projects" treemap, save it as a dated PNG, and display it.

    Input parameters:
        frequency_dict: dictionary; its keys are passed as the treemap labels
            and its values as the treemap values
        width, height: forwarded to the figure layout

    Output: nothing is returned; the figure is written to
    'treemap-YYYYMMDD.png' (today's date) and then shown.
    """
    tile_labels = list(frequency_dict.keys())
    tile_values = list(frequency_dict.values())

    # Every label is given an empty parent string.
    treemap = go.Treemap(
        labels=tile_labels,
        parents=[""] * len(tile_labels),
        values=tile_values,
        textinfo="label+value",
    )
    figure = go.Figure(treemap)
    figure.update_layout(title="Projects", width=width, height=height)

    date_stamp = date.today().strftime("%Y%m%d")
    figure.write_image(f"treemap-{date_stamp}.png")
    figure.show()
1379 | | - |
|
0 commit comments