|
117 | 117 | },
|
118 | 118 | {
|
119 | 119 | "cell_type": "code",
|
120 |
| - "execution_count": 49, |
| 120 | + "execution_count": null, |
121 | 121 | "metadata": {},
|
122 | 122 | "outputs": [],
|
123 | 123 | "source": [
|
|
127 | 127 | "def generate_elements_from_lines(filename: str) -> Iterator[dict[str, str]]:\n",
|
128 | 128 | "\n",
|
129 | 129 | " def get_attrs(line: str) -> dict[str, str]:\n",
|
130 |
| - " (_, attrs) = line.split(\"<row \", 2)\n", |
| 130 | + " _, attrs = line.split(\"<row \", 2)\n", |
131 | 131 | " return {m.group(1): m.group(2)\n",
|
132 | 132 | " for m in re.finditer(r\"(\\w*?)=\\\"(.*?)\\\"\", attrs)}\n",
|
133 | 133 | "\n",
|
|
172 | 172 | "metadata": {},
|
173 | 173 | "outputs": [],
|
174 | 174 | "source": [
|
175 |
| - "all_attrs_posts = get_all_attrs(generate_elements_from_lines(\"Posts.xml\"))\n", |
| 175 | + "all_attrs_posts: set[str] = get_all_attrs(generate_elements_from_lines(\"Posts.xml\"))\n", |
176 | 176 | "all_attrs_posts"
|
177 | 177 | ]
|
178 | 178 | },
|
|
200 | 200 | "metadata": {},
|
201 | 201 | "outputs": [],
|
202 | 202 | "source": [
|
203 |
| - "all_attrs_posts = id_as_first_attribute(all_attrs_posts, 'Id')\n", |
| 203 | + "all_attrs_posts: list[str] = id_as_first_attribute(all_attrs_posts, 'Id')\n", |
204 | 204 | "all_attrs_posts"
|
205 | 205 | ]
|
206 | 206 | },
|
|
231 | 231 | },
|
232 | 232 | {
|
233 | 233 | "cell_type": "code",
|
234 |
| - "execution_count": 55, |
| 234 | + "execution_count": null, |
235 | 235 | "metadata": {},
|
236 | 236 | "outputs": [],
|
237 | 237 | "source": [
|
|
246 | 246 | "\n",
|
247 | 247 | " # Recorrer el iterador\n",
|
248 | 248 | " for row in iterator:\n",
|
249 |
| - " row_to_write = [row.get(att, '') for att in all_attrs]\n", |
| 249 | + " row_to_write: list[str] = [row.get(att, '') for att in all_attrs]\n", |
250 | 250 | " cw.writerow(row_to_write)"
|
251 | 251 | ]
|
252 | 252 | },
|
|
261 | 261 | },
|
262 | 262 | {
|
263 | 263 | "cell_type": "code",
|
264 |
| - "execution_count": 57, |
| 264 | + "execution_count": null, |
265 | 265 | "metadata": {},
|
266 | 266 | "outputs": [],
|
267 | 267 | "source": [
|
268 | 268 | "# Write the df dataframe to parquet file\n",
|
269 |
| - "df = pd.read_csv('Posts.csv', encoding='utf-8', header=0,\n", |
| 269 | + "from pandas import DataFrame\n", |
| 270 | + "\n", |
| 271 | + "df: DataFrame = pd.read_csv('Posts.csv', encoding='utf-8', header=0,\n", |
270 | 272 | " dtype={'Id': 'Int64', 'PostTypeId': 'Int64', 'AcceptedAnswerId': 'Int64', 'ParentId': 'Int64',\n",
|
271 | 273 | " 'Score': 'Int64', 'ViewCount': 'Int64',\n",
|
272 | 274 | " 'Body': pd.StringDtype(), 'OwnerUserId': 'Int64', 'OwnerDisplayName': pd.StringDtype(),\n",
|
|
301 | 303 | "metadata": {},
|
302 | 304 | "outputs": [],
|
303 | 305 | "source": [
|
304 |
| - "all_attrs_votes = get_all_attrs(generate_elements_from_lines(\"Votes.xml\"))\n", |
| 306 | + "all_attrs_votes: set[str] = get_all_attrs(generate_elements_from_lines(\"Votes.xml\"))\n", |
305 | 307 | "all_attrs_votes"
|
306 | 308 | ]
|
307 | 309 | },
|
|
311 | 313 | "metadata": {},
|
312 | 314 | "outputs": [],
|
313 | 315 | "source": [
|
314 |
| - "all_attrs_votes = id_as_first_attribute(all_attrs_votes, 'Id')\n", |
| 316 | + "all_attrs_votes: list[str] = id_as_first_attribute(all_attrs_votes, 'Id')\n", |
315 | 317 | "all_attrs_votes"
|
316 | 318 | ]
|
317 | 319 | },
|
|
326 | 328 | },
|
327 | 329 | {
|
328 | 330 | "cell_type": "code",
|
329 |
| - "execution_count": 63, |
| 331 | + "execution_count": null, |
330 | 332 | "metadata": {},
|
331 | 333 | "outputs": [],
|
332 | 334 | "source": [
|
333 | 335 | "# Write the df dataframe to parquet file\n",
|
334 |
| - "df = pd.read_csv('Votes.csv', encoding='utf-8', header=0,\n", |
| 336 | + "from pandas import DataFrame\n", |
| 337 | + "\n", |
| 338 | + "df: DataFrame = pd.read_csv('Votes.csv', encoding='utf-8', header=0,\n", |
335 | 339 | " dtype={'Id': 'Int64', 'VoteTypeId' : 'Int64', 'BountyAmount' : 'Int64', 'PostId': 'Int64', 'UserId' : 'Int64' },\n",
|
336 | 340 | " parse_dates=['CreationDate'])"
|
337 | 341 | ]
|
|
360 | 364 | "metadata": {},
|
361 | 365 | "outputs": [],
|
362 | 366 | "source": [
|
363 |
| - "all_attrs_tags = get_all_attrs(generate_elements_from_lines(\"Tags.xml\"))\n", |
| 367 | + "all_attrs_tags: set[str] = get_all_attrs(generate_elements_from_lines(\"Tags.xml\"))\n", |
364 | 368 | "all_attrs_tags"
|
365 | 369 | ]
|
366 | 370 | },
|
|
370 | 374 | "metadata": {},
|
371 | 375 | "outputs": [],
|
372 | 376 | "source": [
|
373 |
| - "all_attrs_tags = id_as_first_attribute(all_attrs_tags, 'Id')\n", |
| 377 | + "all_attrs_tags: list[str] = id_as_first_attribute(all_attrs_tags, 'Id')\n", |
374 | 378 | "all_attrs_tags"
|
375 | 379 | ]
|
376 | 380 | },
|
|
385 | 389 | },
|
386 | 390 | {
|
387 | 391 | "cell_type": "code",
|
388 |
| - "execution_count": 69, |
| 392 | + "execution_count": null, |
389 | 393 | "metadata": {},
|
390 | 394 | "outputs": [],
|
391 | 395 | "source": [
|
392 | 396 | "# Write the df dataframe to parquet file\n",
|
393 |
| - "df = pd.read_csv('Tags.csv', encoding='utf-8', header=0,\n", |
| 397 | + "df: DataFrame = pd.read_csv('Tags.csv', encoding='utf-8', header=0,\n", |
394 | 398 | " dtype={'Id': 'Int64',\n",
|
395 | 399 | " 'Count' : 'Int64',\n",
|
396 | 400 | " 'TagName' : pd.StringDtype(),\n",
|
|
432 | 436 | "metadata": {},
|
433 | 437 | "outputs": [],
|
434 | 438 | "source": [
|
435 |
| - "all_attrs_users = get_all_attrs(generate_elements_from_lines(\"Users.xml\"))\n", |
| 439 | + "all_attrs_users: set[str] = get_all_attrs(generate_elements_from_lines(\"Users.xml\"))\n", |
436 | 440 | "all_attrs_users"
|
437 | 441 | ]
|
438 | 442 | },
|
|
442 | 446 | "metadata": {},
|
443 | 447 | "outputs": [],
|
444 | 448 | "source": [
|
445 |
| - "all_attrs_users = id_as_first_attribute(all_attrs_users, 'Id')\n", |
| 449 | + "all_attrs_users: list[str] = id_as_first_attribute(all_attrs_users, 'Id')\n", |
446 | 450 | "all_attrs_users"
|
447 | 451 | ]
|
448 | 452 | },
|
|
466 | 470 | },
|
467 | 471 | {
|
468 | 472 | "cell_type": "code",
|
469 |
| - "execution_count": 77, |
| 473 | + "execution_count": null, |
470 | 474 | "metadata": {},
|
471 | 475 | "outputs": [],
|
472 | 476 | "source": [
|
473 | 477 | "# Write the df dataframe to parquet file\n",
|
474 |
| - "df = pd.read_csv('Users.csv', encoding='utf-8', header=0,\n", |
| 478 | + "df: DataFrame = pd.read_csv('Users.csv', encoding='utf-8', header=0,\n", |
475 | 479 | " dtype={'Id': 'Int64',\n",
|
476 | 480 | " 'DisplayName': pd.StringDtype(),\n",
|
477 | 481 | " 'Location' : pd.StringDtype(),\n",
|
|
518 | 522 | "metadata": {},
|
519 | 523 | "outputs": [],
|
520 | 524 | "source": [
|
521 |
| - "all_attrs_comments = get_all_attrs(generate_elements_from_lines(\"Comments.xml\"))\n", |
| 525 | + "all_attrs_comments: set[str] = get_all_attrs(generate_elements_from_lines(\"Comments.xml\"))\n", |
522 | 526 | "all_attrs_users"
|
523 | 527 | ]
|
524 | 528 | },
|
|
528 | 532 | "metadata": {},
|
529 | 533 | "outputs": [],
|
530 | 534 | "source": [
|
531 |
| - "all_attrs_comments = id_as_first_attribute(all_attrs_comments, 'Id')\n", |
| 535 | + "all_attrs_comments: list[str] = id_as_first_attribute(all_attrs_comments, 'Id')\n", |
532 | 536 | "all_attrs_comments"
|
533 | 537 | ]
|
534 | 538 | },
|
|
543 | 547 | },
|
544 | 548 | {
|
545 | 549 | "cell_type": "code",
|
546 |
| - "execution_count": 84, |
| 550 | + "execution_count": null, |
547 | 551 | "metadata": {},
|
548 | 552 | "outputs": [],
|
549 | 553 | "source": [
|
550 | 554 | "# Write the df dataframe to parquet file\n",
|
551 |
| - "df = pd.read_csv('Comments.csv', encoding='utf-8', header=0,\n", |
| 555 | + "df: DataFrame = pd.read_csv('Comments.csv', encoding='utf-8', header=0,\n", |
552 | 556 | " dtype={'Id': 'Int64',\n",
|
553 | 557 | " 'ContentLicense' : pd.StringDtype(),\n",
|
554 | 558 | " 'PostId' : 'Int64',\n",
|
|
604 | 608 | "name": "python",
|
605 | 609 | "nbconvert_exporter": "python",
|
606 | 610 | "pygments_lexer": "ipython3",
|
607 |
| - "version": "3.12.7" |
| 611 | + "version": "3.13.3" |
608 | 612 | }
|
609 | 613 | },
|
610 | 614 | "nbformat": 4,
|
|
0 commit comments