74
74
< div data-md-component ="skip ">
75
75
76
76
77
- < a href ="#using-the-special-export-tool " class ="md-skip ">
77
+ < a href ="#importing-packages " class ="md-skip ">
78
78
Skip to content
79
79
</ a >
80
80
389
389
< ul class ="md-nav__list " data-md-component ="toc " data-md-scrollfix >
390
390
391
391
< li class ="md-nav__item ">
392
- < a href ="#using-the-special-export-tool " class ="md-nav__link ">
392
+ < a href ="#importing-packages " class ="md-nav__link ">
393
393
< span class ="md-ellipsis ">
394
- Using the Special Export Tool
394
+ Importing Packages
395
395
</ span >
396
396
</ a >
397
397
398
- < nav class ="md-nav " aria-label ="Using the Special Export Tool ">
399
- < ul class ="md-nav__list ">
400
-
401
- < li class ="md-nav__item ">
402
- < a href ="#examples " class ="md-nav__link ">
398
+ </ li >
399
+
400
+ < li class ="md-nav__item ">
401
+ < a href ="#using-the-special-export-tool " class ="md-nav__link ">
403
402
< span class ="md-ellipsis ">
404
- Examples
403
+ Using the Special Export Tool
405
404
</ span >
406
405
</ a >
407
406
408
- </ li >
409
-
410
- </ ul >
411
- </ nav >
412
-
413
407
</ li >
414
408
415
409
< li class ="md-nav__item ">
416
- < a href ="#using-the-requests-library " class ="md-nav__link ">
410
+ < a href ="#fetching-xml-data-with-requests " class ="md-nav__link ">
417
411
< span class ="md-ellipsis ">
418
- Using the requests Library
412
+ Fetching XML Data with requests
419
413
</ span >
420
414
</ a >
421
415
723
717
< ul class ="md-nav__list " data-md-component ="toc " data-md-scrollfix >
724
718
725
719
< li class ="md-nav__item ">
726
- < a href ="#using-the-special-export-tool " class ="md-nav__link ">
720
+ < a href ="#importing-packages " class ="md-nav__link ">
727
721
< span class ="md-ellipsis ">
728
- Using the Special Export Tool
722
+ Importing Packages
729
723
</ span >
730
724
</ a >
731
725
732
- < nav class ="md-nav " aria-label ="Using the Special Export Tool ">
733
- < ul class ="md-nav__list ">
734
-
735
- < li class ="md-nav__item ">
736
- < a href ="#examples " class ="md-nav__link ">
726
+ </ li >
727
+
728
+ < li class ="md-nav__item ">
729
+ < a href ="#using-the-special-export-tool " class ="md-nav__link ">
737
730
< span class ="md-ellipsis ">
738
- Examples
731
+ Using the Special Export Tool
739
732
</ span >
740
733
</ a >
741
734
742
- </ li >
743
-
744
- </ ul >
745
- </ nav >
746
-
747
735
</ li >
748
736
749
737
< li class ="md-nav__item ">
750
- < a href ="#using-the-requests-library " class ="md-nav__link ">
738
+ < a href ="#fetching-xml-data-with-requests " class ="md-nav__link ">
751
739
< span class ="md-ellipsis ">
752
- Using the requests Library
740
+ Fetching XML Data with requests
753
741
</ span >
754
742
</ a >
755
743
@@ -777,18 +765,21 @@ <h1>Special Export tool</h1>
777
765
778
766
< p > The < strong > Special Export</ strong > tool fetches specific pages with their raw content (< em > wikitext</ em > ) in real-time, without needing to download the entire dataset. The content is provided in XML format.</ p >
779
767
< div class ="toc "> < span class ="toctitle "> On this page</ span > < ul >
780
- < li > < a href ="#using-the-special-export-tool "> Using the Special Export Tool</ a > < ul >
781
- < li > < a href ="#examples "> Examples</ a > </ li >
782
- </ ul >
783
- </ li >
784
- < li > < a href ="#using-the-requests-library "> Using the requests Library</ a > </ li >
768
+ < li > < a href ="#importing-packages "> Importing Packages</ a > </ li >
769
+ < li > < a href ="#using-the-special-export-tool "> Using the Special Export Tool</ a > </ li >
770
+ < li > < a href ="#fetching-xml-data-with-requests "> Fetching XML Data with requests</ a > </ li >
785
771
</ ul >
786
772
</ div >
773
+ < h2 id ="importing-packages "> Importing Packages< a class ="headerlink " href ="#importing-packages " title ="Permanent link "> ¶</ a > </ h2 >
774
+ < div class ="language-python highlight "> < table class ="highlighttable "> < tr > < th colspan ="2 " class ="filename "> < span class ="filename "> Python</ span > </ th > </ tr > < tr > < td class ="linenos "> < div class ="linenodiv "> < pre > < span > </ span > < span class ="normal "> 1</ span > </ pre > </ div > </ td > < td class ="code "> < div > < pre > < span > </ span > < code > < span class ="kn "> import</ span > < span class ="nn "> requests</ span > < span class ="c1 "> # to fetch info from URLs</ span >
775
+ </ code > </ pre > </ div > </ td > </ tr > </ table > </ div >
787
776
< h2 id ="using-the-special-export-tool "> Using the < strong > Special Export</ strong > Tool< a class ="headerlink " href ="#using-the-special-export-tool " title ="Permanent link "> ¶</ a > </ h2 >
788
777
< p > You can actually use < strong > Special:Export</ strong > to retrieve pages from < em > any</ em > Wiki site. On the German Wiktionary the tool is labelled < strong > Spezial:Exportieren</ strong > , but it works the same way.</ p >
789
- < h3 id ="examples "> Examples< a class ="headerlink " href ="#examples " title ="Permanent link "> ¶</ a > </ h3 >
790
778
< p > < strong > Exporting Pages from Any Wiki Site</ strong > </ p >
791
779
< p > To access the XML content of the page titled "Austria" from English Wikipedia, you can use the following Python code. When you press < code > run</ code > , it will open the export link in your default browser:</ p >
780
+ < div class ="tabbed-set tabbed-alternate " data-tabs ="1:2 "> < input checked ="checked " id ="exec-2--__tabbed_1_1 " name ="exec-2--__tabbed_1 " type ="radio " /> < input id ="exec-2--__tabbed_1_2 " name ="exec-2--__tabbed_1 " type ="radio " /> < div class ="tabbed-labels "> < label for ="exec-2--__tabbed_1_1 "> Source</ label > < label for ="exec-2--__tabbed_1_2 "> Result</ label > </ div >
781
+ < div class ="tabbed-content ">
782
+ < div class ="tabbed-block ">
792
783
< div class ="language-python highlight "> < table class ="highlighttable "> < tr > < th colspan ="2 " class ="filename "> < span class ="filename "> Python</ span > </ th > </ tr > < tr > < td class ="linenos "> < div class ="linenodiv "> < pre > < span > </ span > < span class ="normal "> 1</ span >
793
784
< span class ="normal "> 2</ span >
794
785
< span class ="normal "> 3</ span >
@@ -797,21 +788,34 @@ <h3 id="examples">Examples<a class="headerlink" href="#examples" title="Permanen
797
788
< span class ="n "> url</ span > < span class ="o "> =</ span > < span class ="sa "> f</ span > < span class ="s1 "> 'https://</ span > < span class ="si "> {</ span > < span class ="n "> domain</ span > < span class ="si "> }</ span > < span class ="s1 "> /wiki/Special:Export/</ span > < span class ="si "> {</ span > < span class ="n "> title</ span > < span class ="si "> }</ span > < span class ="s1 "> '</ span >
798
789
< span class ="nb "> print</ span > < span class ="p "> (</ span > < span class ="n "> url</ span > < span class ="p "> )</ span >
799
790
</ code > </ pre > </ div > </ td > </ tr > </ table > </ div >
791
+ </ div >
792
+ < div class ="tabbed-block ">
800
793
< div class ="language-pycon highlight "> < table class ="highlighttable "> < tr > < th colspan ="2 " class ="filename "> < span class ="filename "> Python Console Session</ span > </ th > </ tr > < tr > < td class ="linenos "> < div class ="linenodiv "> < pre > < span > </ span > < span class ="normal "> 1</ span > </ pre > </ div > </ td > < td class ="code "> < div > < pre > < span > </ span > < code > < span class ="go "> https://en.wikipedia.org/wiki/Special:Export/Austria</ span >
801
794
</ code > </ pre > </ div > </ td > </ tr > </ table > </ div >
795
+ </ div >
796
+ </ div >
797
+ </ div >
802
798
< p > < strong > Exporting Pages from the German Wiktionary</ strong > </ p >
803
- < p > For the German Wiktionary, the export tool uses < code > Spezial:Exportieren</ code > instead of < code > Special:Export</ code > . You can use similar Python code to open the export link for the page titled "schön" (German for "beautiful"):</ p >
799
+ < p > For the German Wiktionary, the export tool uses < code > Spezial:Exportieren</ code > instead of < code > Special:Export</ code > . You can use similar Python code to build and print the export link for the page titled "hoch" (German for "high"):</ p >
800
+ < div class ="tabbed-set tabbed-alternate " data-tabs ="1:2 "> < input checked ="checked " id ="exec-3--__tabbed_1_1 " name ="exec-3--__tabbed_1 " type ="radio " /> < input id ="exec-3--__tabbed_1_2 " name ="exec-3--__tabbed_1 " type ="radio " /> < div class ="tabbed-labels "> < label for ="exec-3--__tabbed_1_1 "> Source</ label > < label for ="exec-3--__tabbed_1_2 "> Result</ label > </ div >
801
+ < div class ="tabbed-content ">
802
+ < div class ="tabbed-block ">
804
803
< div class ="language-python highlight "> < table class ="highlighttable "> < tr > < th colspan ="2 " class ="filename "> < span class ="filename "> Python</ span > </ th > </ tr > < tr > < td class ="linenos "> < div class ="linenodiv "> < pre > < span > </ span > < span class ="normal "> 1</ span >
805
804
< span class ="normal "> 2</ span >
806
805
< span class ="normal "> 3</ span >
807
- < span class ="normal "> 4</ span > </ pre > </ div > </ td > < td class ="code "> < div > < pre > < span > </ span > < code > < span class ="n "> title</ span > < span class ="o "> =</ span > < span class ="s1 "> 'schön '</ span >
806
+ < span class ="normal "> 4</ span > </ pre > </ div > </ td > < td class ="code "> < div > < pre > < span > </ span > < code > < span class ="n "> title</ span > < span class ="o "> =</ span > < span class ="s1 "> 'hoch '</ span >
808
807
< span class ="n "> domain</ span > < span class ="o "> =</ span > < span class ="s1 "> 'de.wiktionary.org'</ span >
809
808
< span class ="n "> url</ span > < span class ="o "> =</ span > < span class ="sa "> f</ span > < span class ="s1 "> 'https://</ span > < span class ="si "> {</ span > < span class ="n "> domain</ span > < span class ="si "> }</ span > < span class ="s1 "> /wiki/Spezial:Exportieren/</ span > < span class ="si "> {</ span > < span class ="n "> title</ span > < span class ="si "> }</ span > < span class ="s1 "> '</ span >
810
809
< span class ="nb "> print</ span > < span class ="p "> (</ span > < span class ="n "> url</ span > < span class ="p "> )</ span >
811
810
</ code > </ pre > </ div > </ td > </ tr > </ table > </ div >
812
- < div class ="language-pycon highlight "> < table class ="highlighttable "> < tr > < th colspan ="2 " class ="filename "> < span class ="filename "> Python Console Session</ span > </ th > </ tr > < tr > < td class ="linenos "> < div class ="linenodiv "> < pre > < span > </ span > < span class ="normal "> 1</ span > </ pre > </ div > </ td > < td class ="code "> < div > < pre > < span > </ span > < code > < span class ="go "> https://de.wiktionary.org/wiki/Spezial:Exportieren/schön</ span >
811
+ </ div >
812
+ < div class ="tabbed-block ">
813
+ < div class ="language-pycon highlight "> < table class ="highlighttable "> < tr > < th colspan ="2 " class ="filename "> < span class ="filename "> Python Console Session</ span > </ th > </ tr > < tr > < td class ="linenos "> < div class ="linenodiv "> < pre > < span > </ span > < span class ="normal "> 1</ span > </ pre > </ div > </ td > < td class ="code "> < div > < pre > < span > </ span > < code > < span class ="go "> https://de.wiktionary.org/wiki/Spezial:Exportieren/hoch</ span >
813
814
</ code > </ pre > </ div > </ td > </ tr > </ table > </ div >
814
- < h2 id ="using-the-requests-library "> Using the < code > requests</ code > Library< a class ="headerlink " href ="#using-the-requests-library " title ="Permanent link "> ¶</ a > </ h2 >
815
+ </ div >
816
+ </ div >
817
+ </ div >
818
+ < h2 id ="fetching-xml-data-with-requests "> Fetching XML Data with < code > requests</ code > < a class ="headerlink " href ="#fetching-xml-data-with-requests " title ="Permanent link "> ¶</ a > </ h2 >
815
819
< p > To programmatically fetch and download XML content, you can use Python's < code > requests</ code > library. This example shows how to build the URL, make a request, and get the XML content of a Wiktionary page by its title.</ p >
816
820
< div class ="language-python highlight "> < table class ="highlighttable "> < tr > < th colspan ="2 " class ="filename "> < span class ="filename "> Python</ span > </ th > </ tr > < tr > < td class ="linenos "> < div class ="linenodiv "> < pre > < span > </ span > < span class ="normal "> 1</ span >
817
821
< span class ="normal "> 2</ span >
@@ -824,11 +828,7 @@ <h2 id="using-the-requests-library">Using the <code>requests</code> Library<a cl
824
828
< span class ="normal "> 9</ span >
825
829
< span class ="normal "> 10</ span >
826
830
< span class ="normal "> 11</ span >
827
- < span class ="normal "> 12</ span >
828
- < span class ="normal "> 13</ span >
829
- < span class ="normal "> 14</ span > </ pre > </ div > </ td > < td class ="code "> < div > < pre > < span > </ span > < code > < span class ="kn "> import</ span > < span class ="nn "> requests</ span >
830
-
831
- < span class ="k "> def</ span > < span class ="nf "> fetch</ span > < span class ="p "> (</ span > < span class ="n "> title</ span > < span class ="p "> ):</ span >
831
+ < span class ="normal "> 12</ span > </ pre > </ div > </ td > < td class ="code "> < div > < pre > < span > </ span > < code > < span class ="k "> def</ span > < span class ="nf "> fetch</ span > < span class ="p "> (</ span > < span class ="n "> title</ span > < span class ="p "> ):</ span >
832
832
< span class ="c1 "> # Construct the URL for the XML export of the given page title</ span >
833
833
< span class ="n "> url</ span > < span class ="o "> =</ span > < span class ="sa "> f</ span > < span class ="s1 "> 'https://de.wiktionary.org/wiki/Spezial:Exportieren/</ span > < span class ="si "> {</ span > < span class ="n "> title</ span > < span class ="si "> }</ span > < span class ="s1 "> '</ span >
834
834
@@ -839,10 +839,10 @@ <h2 id="using-the-requests-library">Using the <code>requests</code> Library<a cl
839
839
< span class ="n "> resp</ span > < span class ="o "> .</ span > < span class ="n "> raise_for_status</ span > < span class ="p "> ()</ span >
840
840
841
841
< span class ="c1 "> # Return the XML content of the requested page</ span >
842
- < span class ="k "> return</ span > < span class ="n "> resp</ span > < span class ="o "> .</ span > < span class ="n "> content </ span >
842
+ < span class ="k "> return</ span > < span class ="n "> resp</ span > < span class ="o "> .</ span > < span class ="n "> text </ span >
843
843
</ code > </ pre > </ div > </ td > </ tr > </ table > </ div >
844
- < p > Next, let us attempt to retrieve the XML content for the page titled "hoch" and print the initial 500 bytes for a glimpse of the XML content displayed in the < code > Result </ code > tab .</ p >
845
- < div class ="tabbed-set tabbed-alternate " data-tabs ="1:2 "> < input checked ="checked " id ="exec-4 --__tabbed_1_1 " name ="exec-4 --__tabbed_1 " type ="radio " /> < input id ="exec-4 --__tabbed_1_2 " name ="exec-4 --__tabbed_1 " type ="radio " /> < div class ="tabbed-labels "> < label for ="exec-4 --__tabbed_1_1 "> Source</ label > < label for ="exec-4 --__tabbed_1_2 "> Result</ label > </ div >
844
+ < p > Next, let us retrieve the XML content for the page titled "hoch" and print the first 500 characters for a glimpse of the XML content.</ p >
845
+ < div class ="tabbed-set tabbed-alternate " data-tabs ="1:2 "> < input checked ="checked " id ="exec-5 --__tabbed_1_1 " name ="exec-5 --__tabbed_1 " type ="radio " /> < input id ="exec-5 --__tabbed_1_2 " name ="exec-5 --__tabbed_1 " type ="radio " /> < div class ="tabbed-labels "> < label for ="exec-5 --__tabbed_1_1 "> Source</ label > < label for ="exec-5 --__tabbed_1_2 "> Result</ label > </ div >
846
846
< div class ="tabbed-content ">
847
847
< div class ="tabbed-block ">
848
848
< div class ="language-python highlight "> < table class ="highlighttable "> < tr > < th colspan ="2 " class ="filename "> < span class ="filename "> Python</ span > </ th > </ tr > < tr > < td class ="linenos "> < div class ="linenodiv "> < pre > < span > </ span > < span class ="normal "> 1</ span >
@@ -851,7 +851,21 @@ <h2 id="using-the-requests-library">Using the <code>requests</code> Library<a cl
851
851
</ code > </ pre > </ div > </ td > </ tr > </ table > </ div >
852
852
</ div >
853
853
< div class ="tabbed-block ">
854
- < div class ="language-pycon highlight "> < table class ="highlighttable "> < tr > < th colspan ="2 " class ="filename "> < span class ="filename "> Python Console Session</ span > </ th > </ tr > < tr > < td class ="linenos "> < div class ="linenodiv "> < pre > < span > </ span > < span class ="normal "> 1</ span > </ pre > </ div > </ td > < td class ="code "> < div > < pre > < span > </ span > < code > < span class ="go "> b'<mediawiki xmlns="http://www.mediawiki.org/xml/export-0.11/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.mediawiki.org/xml/export-0.11/ http://www.mediawiki.org/xml/export-0.11.xsd" version="0.11" xml:lang="de">\n <siteinfo>\n <sitename>Wiktionary</sitename>\n <dbname>dewiktionary</dbname>\n <base>https://de.wiktionary.org/wiki/Wiktionary:Hauptseite</base>\n <generator>MediaWiki 1.44.0-wmf.16</generator>\n <case>case-sensitive</case>\n <namesp'</ span >
854
+ < div class ="language-pycon highlight "> < table class ="highlighttable "> < tr > < th colspan ="2 " class ="filename "> < span class ="filename "> Python Console Session</ span > </ th > </ tr > < tr > < td class ="linenos "> < div class ="linenodiv "> < pre > < span > </ span > < span class ="normal "> 1</ span >
855
+ < span class ="normal "> 2</ span >
856
+ < span class ="normal "> 3</ span >
857
+ < span class ="normal "> 4</ span >
858
+ < span class ="normal "> 5</ span >
859
+ < span class ="normal "> 6</ span >
860
+ < span class ="normal "> 7</ span >
861
+ < span class ="normal "> 8</ span > </ pre > </ div > </ td > < td class ="code "> < div > < pre > < span > </ span > < code > < span class ="go "> <mediawiki xmlns="http://www.mediawiki.org/xml/export-0.11/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.mediawiki.org/xml/export-0.11/ http://www.mediawiki.org/xml/export-0.11.xsd" version="0.11" xml:lang="de"></ span >
862
+ < span class ="go "> <siteinfo></ span >
863
+ < span class ="go "> <sitename>Wiktionary</sitename></ span >
864
+ < span class ="go "> <dbname>dewiktionary</dbname></ span >
865
+ < span class ="go "> <base>https://de.wiktionary.org/wiki/Wiktionary:Hauptseite</base></ span >
866
+ < span class ="go "> <generator>MediaWiki 1.44.0-wmf.17</generator></ span >
867
+ < span class ="go "> <case>case-sensitive</case></ span >
868
+ < span class ="go "> <namesp</ span >
855
869
</ code > </ pre > </ div > </ td > </ tr > </ table > </ div >
856
870
</ div >
857
871
</ div >
0 commit comments