Skip to content

Commit 908a951

Browse files
committed
Review de csv to parquet.
1 parent 7fd6e15 commit 908a951

File tree

1 file changed

+29
-25
lines changed

1 file changed

+29
-25
lines changed

addendum/parquet_convert/csv_to_parquet.ipynb

Lines changed: 29 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -117,7 +117,7 @@
117117
},
118118
{
119119
"cell_type": "code",
120-
"execution_count": 49,
120+
"execution_count": null,
121121
"metadata": {},
122122
"outputs": [],
123123
"source": [
@@ -127,7 +127,7 @@
127127
"def generate_elements_from_lines(filename: str) -> Iterator[dict[str, str]]:\n",
128128
"\n",
129129
" def get_attrs(line: str) -> dict[str, str]:\n",
130-
" (_, attrs) = line.split(\"<row \", 2)\n",
130+
" _, attrs = line.split(\"<row \", 1)\n",
131131
" return {m.group(1): m.group(2)\n",
132132
" for m in re.finditer(r\"(\\w*?)=\\\"(.*?)\\\"\", attrs)}\n",
133133
"\n",
@@ -172,7 +172,7 @@
172172
"metadata": {},
173173
"outputs": [],
174174
"source": [
175-
"all_attrs_posts = get_all_attrs(generate_elements_from_lines(\"Posts.xml\"))\n",
175+
"all_attrs_posts: set[str] = get_all_attrs(generate_elements_from_lines(\"Posts.xml\"))\n",
176176
"all_attrs_posts"
177177
]
178178
},
@@ -200,7 +200,7 @@
200200
"metadata": {},
201201
"outputs": [],
202202
"source": [
203-
"all_attrs_posts = id_as_first_attribute(all_attrs_posts, 'Id')\n",
203+
"all_attrs_posts: list[str] = id_as_first_attribute(all_attrs_posts, 'Id')\n",
204204
"all_attrs_posts"
205205
]
206206
},
@@ -231,7 +231,7 @@
231231
},
232232
{
233233
"cell_type": "code",
234-
"execution_count": 55,
234+
"execution_count": null,
235235
"metadata": {},
236236
"outputs": [],
237237
"source": [
@@ -246,7 +246,7 @@
246246
"\n",
247247
" # Walk through the iterator\n",
248248
" for row in iterator:\n",
249-
" row_to_write = [row.get(att, '') for att in all_attrs]\n",
249+
" row_to_write: list[str] = [row.get(att, '') for att in all_attrs]\n",
250250
" cw.writerow(row_to_write)"
251251
]
252252
},
@@ -261,12 +261,14 @@
261261
},
262262
{
263263
"cell_type": "code",
264-
"execution_count": 57,
264+
"execution_count": null,
265265
"metadata": {},
266266
"outputs": [],
267267
"source": [
268268
"# Write the df dataframe to parquet file\n",
269-
"df = pd.read_csv('Posts.csv', encoding='utf-8', header=0,\n",
269+
"from pandas import DataFrame\n",
270+
"\n",
271+
"df: DataFrame = pd.read_csv('Posts.csv', encoding='utf-8', header=0,\n",
270272
" dtype={'Id': 'Int64', 'PostTypeId': 'Int64', 'AcceptedAnswerId': 'Int64', 'ParentId': 'Int64',\n",
271273
" 'Score': 'Int64', 'ViewCount': 'Int64',\n",
272274
" 'Body': pd.StringDtype(), 'OwnerUserId': 'Int64', 'OwnerDisplayName': pd.StringDtype(),\n",
@@ -301,7 +303,7 @@
301303
"metadata": {},
302304
"outputs": [],
303305
"source": [
304-
"all_attrs_votes = get_all_attrs(generate_elements_from_lines(\"Votes.xml\"))\n",
306+
"all_attrs_votes: set[str] = get_all_attrs(generate_elements_from_lines(\"Votes.xml\"))\n",
305307
"all_attrs_votes"
306308
]
307309
},
@@ -311,7 +313,7 @@
311313
"metadata": {},
312314
"outputs": [],
313315
"source": [
314-
"all_attrs_votes = id_as_first_attribute(all_attrs_votes, 'Id')\n",
316+
"all_attrs_votes: list[str] = id_as_first_attribute(all_attrs_votes, 'Id')\n",
315317
"all_attrs_votes"
316318
]
317319
},
@@ -326,12 +328,14 @@
326328
},
327329
{
328330
"cell_type": "code",
329-
"execution_count": 63,
331+
"execution_count": null,
330332
"metadata": {},
331333
"outputs": [],
332334
"source": [
333335
"# Write the df dataframe to parquet file\n",
334-
"df = pd.read_csv('Votes.csv', encoding='utf-8', header=0,\n",
336+
"from pandas import DataFrame\n",
337+
"\n",
338+
"df: DataFrame = pd.read_csv('Votes.csv', encoding='utf-8', header=0,\n",
335339
" dtype={'Id': 'Int64', 'VoteTypeId' : 'Int64', 'BountyAmount' : 'Int64', 'PostId': 'Int64', 'UserId' : 'Int64' },\n",
336340
" parse_dates=['CreationDate'])"
337341
]
@@ -360,7 +364,7 @@
360364
"metadata": {},
361365
"outputs": [],
362366
"source": [
363-
"all_attrs_tags = get_all_attrs(generate_elements_from_lines(\"Tags.xml\"))\n",
367+
"all_attrs_tags: set[str] = get_all_attrs(generate_elements_from_lines(\"Tags.xml\"))\n",
364368
"all_attrs_tags"
365369
]
366370
},
@@ -370,7 +374,7 @@
370374
"metadata": {},
371375
"outputs": [],
372376
"source": [
373-
"all_attrs_tags = id_as_first_attribute(all_attrs_tags, 'Id')\n",
377+
"all_attrs_tags: list[str] = id_as_first_attribute(all_attrs_tags, 'Id')\n",
374378
"all_attrs_tags"
375379
]
376380
},
@@ -385,12 +389,12 @@
385389
},
386390
{
387391
"cell_type": "code",
388-
"execution_count": 69,
392+
"execution_count": null,
389393
"metadata": {},
390394
"outputs": [],
391395
"source": [
392396
"# Write the df dataframe to parquet file\n",
393-
"df = pd.read_csv('Tags.csv', encoding='utf-8', header=0,\n",
397+
"df: DataFrame = pd.read_csv('Tags.csv', encoding='utf-8', header=0,\n",
394398
" dtype={'Id': 'Int64',\n",
395399
" 'Count' : 'Int64',\n",
396400
" 'TagName' : pd.StringDtype(),\n",
@@ -432,7 +436,7 @@
432436
"metadata": {},
433437
"outputs": [],
434438
"source": [
435-
"all_attrs_users = get_all_attrs(generate_elements_from_lines(\"Users.xml\"))\n",
439+
"all_attrs_users: set[str] = get_all_attrs(generate_elements_from_lines(\"Users.xml\"))\n",
436440
"all_attrs_users"
437441
]
438442
},
@@ -442,7 +446,7 @@
442446
"metadata": {},
443447
"outputs": [],
444448
"source": [
445-
"all_attrs_users = id_as_first_attribute(all_attrs_users, 'Id')\n",
449+
"all_attrs_users: list[str] = id_as_first_attribute(all_attrs_users, 'Id')\n",
446450
"all_attrs_users"
447451
]
448452
},
@@ -466,12 +470,12 @@
466470
},
467471
{
468472
"cell_type": "code",
469-
"execution_count": 77,
473+
"execution_count": null,
470474
"metadata": {},
471475
"outputs": [],
472476
"source": [
473477
"# Write the df dataframe to parquet file\n",
474-
"df = pd.read_csv('Users.csv', encoding='utf-8', header=0,\n",
478+
"df: DataFrame = pd.read_csv('Users.csv', encoding='utf-8', header=0,\n",
475479
" dtype={'Id': 'Int64',\n",
476480
" 'DisplayName': pd.StringDtype(),\n",
477481
" 'Location' : pd.StringDtype(),\n",
@@ -518,7 +522,7 @@
518522
"metadata": {},
519523
"outputs": [],
520524
"source": [
521-
"all_attrs_comments = get_all_attrs(generate_elements_from_lines(\"Comments.xml\"))\n",
525+
"all_attrs_comments: set[str] = get_all_attrs(generate_elements_from_lines(\"Comments.xml\"))\n",
522526
"all_attrs_comments"
523527
]
524528
},
@@ -528,7 +532,7 @@
528532
"metadata": {},
529533
"outputs": [],
530534
"source": [
531-
"all_attrs_comments = id_as_first_attribute(all_attrs_comments, 'Id')\n",
535+
"all_attrs_comments: list[str] = id_as_first_attribute(all_attrs_comments, 'Id')\n",
532536
"all_attrs_comments"
533537
]
534538
},
@@ -543,12 +547,12 @@
543547
},
544548
{
545549
"cell_type": "code",
546-
"execution_count": 84,
550+
"execution_count": null,
547551
"metadata": {},
548552
"outputs": [],
549553
"source": [
550554
"# Write the df dataframe to parquet file\n",
551-
"df = pd.read_csv('Comments.csv', encoding='utf-8', header=0,\n",
555+
"df: DataFrame = pd.read_csv('Comments.csv', encoding='utf-8', header=0,\n",
552556
" dtype={'Id': 'Int64',\n",
553557
" 'ContentLicense' : pd.StringDtype(),\n",
554558
" 'PostId' : 'Int64',\n",
@@ -604,7 +608,7 @@
604608
"name": "python",
605609
"nbconvert_exporter": "python",
606610
"pygments_lexer": "ipython3",
607-
"version": "3.12.7"
611+
"version": "3.13.3"
608612
}
609613
},
610614
"nbformat": 4,

0 commit comments

Comments
 (0)