
ezachte at svn
Apr 30, 2012, 9:52 AM
Post #1 of 1
(37 views)
Permalink
|
|
SVN: [115089] trunk/wikistats/squids
|
|
https://www.mediawiki.org/wiki/Special:Code/MediaWiki/115089 Revision: 115089 Author: ezachte Date: 2012-04-30 16:52:33 +0000 (Mon, 30 Apr 2012) Log Message: ----------- some pl and Config.pm files adapted to new folder hierarchy on stat1 Viz* files added (generate input for animation) Modified Paths: -------------- trunk/wikistats/squids/SquidCountryScan.pl trunk/wikistats/squids/SquidCountryScanConfig.pm trunk/wikistats/squids/SquidLoadScan.pl trunk/wikistats/squids/SquidReportArchive.pl trunk/wikistats/squids/SquidReportArchive.sh trunk/wikistats/squids/SquidReportArchiveConfig.pm Added Paths: ----------- trunk/wikistats/squids/SquidTraceUniqueImages.pl trunk/wikistats/squids/VizCollectEvents.pl trunk/wikistats/squids/VizCollectEventsMonth.pl trunk/wikistats/squids/VizPrepJs.pl Modified: trunk/wikistats/squids/SquidCountryScan.pl =================================================================== --- trunk/wikistats/squids/SquidCountryScan.pl 2012-04-30 12:39:01 UTC (rev 115088) +++ trunk/wikistats/squids/SquidCountryScan.pl 2012-04-30 16:52:33 UTC (rev 115089) @@ -1,3 +1,4 @@ + #!/usr/bin/perl ## Collect page views stats by country on Locke ## sub CollectRawData -> SquidDataCountries.csv @@ -23,22 +24,26 @@ # exit ; } - $path_root = $job_runs_on_production_server ? $cfg_path_root_production : $cfg_path_root_test ; + $path_csv = $job_runs_on_production_server ? $cfg_path_csv : $cfg_path_csv_test ; + $path_log = $job_runs_on_production_server ? 
$cfg_path_log : $cfg_path_log_test ; + $file_log = "SquidCountryScan.log" ; - $file_raw_data_monthly_visits = "$path_root/SquidDataVisitsPerCountryMonthly.csv" ; - $file_raw_data_daily_visits = "$path_root/SquidDataVisitsPerCountryDaily.csv" ; + $file_raw_data_monthly_visits = "$path_csv/SquidDataVisitsPerCountryMonthly.csv" ; + $file_raw_data_daily_visits = "$path_csv/SquidDataVisitsPerCountryDaily.csv" ; $file_per_country_visits = "public/SquidDataCountriesViews.csv" ; $file_per_country_visits_old = "SquidDataCountries2.csv" ; - $file_raw_data_monthly_saves = "$path_root/SquidDataSavesPerCountryMonthly.csv" ; - $file_raw_data_daily_saves = "$path_root/SquidDataSavesPerCountryDaily.csv" ; + $file_raw_data_monthly_saves = "$path_csv/SquidDataSavesPerCountryMonthly.csv" ; + $file_raw_data_daily_saves = "$path_csv/SquidDataSavesPerCountryDaily.csv" ; $file_per_country_saves = "public/SquidDataCountriesSaves.csv" ; $file_per_country_saves_old = "SquidDataCountriesSaves.csv" ; &CollectRawData ('visits', $file_per_country_visits, $file_per_country_visits_old, $file_raw_data_monthly_visits, $file_raw_data_daily_visits) ; &CollectRawData ('saves', $file_per_country_saves, $file_per_country_saves_old, $file_raw_data_monthly_saves, $file_raw_data_daily_saves) ; - &ProcessRawData ; +# &ProcessRawData ; + print "\n\nReady\n\n" ; + exit ; sub CollectRawData @@ -60,7 +65,7 @@ while ($true) { - $dir = "$path_root/" . sprintf ("%04d-%02d", $year, $month) ; + $dir = "$path_csv/" . sprintf ("%04d-%02d", $year, $month) ; $yyyymm = sprintf ("%04d-%02d", $year, $month) ; if (-d $dir) { @@ -243,6 +248,7 @@ } } +# not operational, obsolete? Q&D code? sub ProcessRawData { print "\nProcessRawData\n\n" ; @@ -457,7 +463,6 @@ { $ratio = sprintf ("%5.1f", $count_edits / $count_submits) ; } $text .= sprintf ("%-14s",'total') . "edits " . sprintf ("%6d", $count_edits) . ", submits ". sprintf ("%6d", $count_submits) . 
", ratio $ratio\n" ; $text .= "\n\n" ; - print $count $text .= "Count per relevant status with redlink:\n" ; foreach $key (sort keys %counts_per_relevant_status_with_redlink) @@ -469,7 +474,7 @@ } $text .= "\n\n" ; - open SUMMARY, '>', $file_txt_summary ; + open SUMMARY, '>', "$path_log/$file_log" ; print SUMMARY $text ; close SUMMARY ; @@ -489,3 +494,10 @@ my $days = ($timegm2-$timegm1) / (24*60*60) ; return ($days) ; } + +sub Log +{ + my $msg = shift ; + print $msg ; +} + Modified: trunk/wikistats/squids/SquidCountryScanConfig.pm =================================================================== --- trunk/wikistats/squids/SquidCountryScanConfig.pm 2012-04-30 12:39:01 UTC (rev 115088) +++ trunk/wikistats/squids/SquidCountryScanConfig.pm 2012-04-30 16:52:33 UTC (rev 115089) @@ -1,7 +1,11 @@ #!/usr/bin/perl - $cfg_liblocation = "/home/ezachte/lib" ; + $cfg_liblocation = "/a/squid/stats/scripts" ; - $cfg_path_root_production = "/a/ezachte/" ; - $cfg_path_root_test = "w:/! perl/squids/archive/" ; # Erik -# $cfg_path_root_test = "?" ; # Andr\xE9 + $cfg_path_csv = "/a/squid/stats/csv/" ; + $cfg_path_csv_test = "w:/! perl/squids/archive/" ; # Erik +# $cfg_path_csv_test = "?" ; # Andr\xE9 + + $cfg_path_log = "/a/squid/stats/scripts/" ; + $cfg_path_log_test = "w:/! perl/squids/archive/" ; # Erik +# $cfg_path_log_test = "?" 
; # Andr\xE9 Modified: trunk/wikistats/squids/SquidLoadScan.pl =================================================================== --- trunk/wikistats/squids/SquidLoadScan.pl 2012-04-30 12:39:01 UTC (rev 115088) +++ trunk/wikistats/squids/SquidLoadScan.pl 2012-04-30 16:52:33 UTC (rev 115089) @@ -104,6 +104,7 @@ { $avg_delta_all_regular_squids = sprintf ("%.0f", $all_regular_squids_delta_hour {$date_hour} / $all_regular_squids_active {$date_hour}) ; print CSV "$date_hour,$avg_delta_all_regular_squids\n" ; + print "$date_hour,$avg_delta_all_regular_squids\n" ; } close CSV ; } Modified: trunk/wikistats/squids/SquidReportArchive.pl =================================================================== --- trunk/wikistats/squids/SquidReportArchive.pl 2012-04-30 12:39:01 UTC (rev 115088) +++ trunk/wikistats/squids/SquidReportArchive.pl 2012-04-30 16:52:33 UTC (rev 115089) @@ -1,5 +1,7 @@ #!/usr/bin/perl + $| = 1; # Flush output + use SquidReportArchiveConfig ; use lib $cfg_liblocation ; @@ -32,18 +34,20 @@ undef %country_code_not_specified_reported ; - $path_in = $job_runs_on_production_server ? $cfg_path_in_production : $cfg_path_in_test ; - $path_out = $job_runs_on_production_server ? $cfg_path_out_production : $cfg_path_out_test ; + $path_csv = $job_runs_on_production_server ? $cfg_path_csv : $cfg_path_csv_test ; + $path_reports = $job_runs_on_production_server ? $cfg_path_reports : $cfg_path_reports_test ; + $path_log = $job_runs_on_production_server ? $cfg_path_log : $cfg_path_log_test ; - &Log ("Path in = $path_in\n") ; - &Log ("Path out = $path_out\n") ; + &Log ("Path csv = $path_csv\n") ; + &Log ("Path reports = $path_reports\n") ; + &Log ("Path log = $path_log\n") ; # following test needs to change -> remove server name dependency (new run argument ?) 
# elsif ($hostname eq 'bayes') # { # &Log ("\n\nJob runs on server $hostname\n\n") ; -# $path_in = "/home/ezachte/wikistats/animation" ; -# $path_out = "/home/ezachte/wikistats/animation" ; +# $path_csv = "/home/ezachte/wikistats/animation" ; +# $path_reports = "/home/ezachte/wikistats/animation" ; # } $file_csv_country_meta_info = "SquidReportCountryMetaInfo.csv" ; @@ -52,28 +56,66 @@ # 'http://en.wikipedia.org/wiki/List_of_countries_by_population' # 'http://en.wikipedia.org/wiki/List_of_countries_by_number_of_Internet_users' if (defined ($options {"w"})) - { &ReadWikipedia ; &Log ("Ready\n") ; exit ; } + { + &ReadWikipedia ; + &Log ("\n\nReady\n\n") ; + exit ; + } + elsif (defined ($options {"c"})) + { + $reportcountries = $true ; + &Log ("\nGenerate report per country\n\n") ; - if (defined ($options {"c"})) - { $reportcountries = $true ; } + if (defined ($options {"q"})) + { + $quarter_only = $options {"q"} ; # process for this quarter only + if ($quarter_only !~ /^2\d\d\dQ\d$/) + { abort ("Specify run for one single quarter as -q yyyyQ[1-4], e.g. -q 2011Q3, not '$quarter_only'\n") ; } + $quarter_only =~ s/^(\d\d\d\d)(Q\d)$/$1 $2/ ; + &Log ("\nRun for one quarter only: $quarter_only\n\n") ; + } + } + elsif (defined ($options {"m"}) || defined ($options {"d"})) + { + if (($options {"m"} !~ /^\d\d\d\d-\d\d$/) && ($options {"d"} !~ /^-\d+$/)) + { &Log ("Specify month as -m yyyy-mm or days back as -d -[days] (e.g. -d -1 for yesterday)") ; exit ; } - if (defined ($options {"q"})) - { - $quarter_only = $options {"q"} ; # process for this quarter only - if ($quarter_only !~ /^2\d\d\dQ\d$/) - { abort ("Specify run for one single quarter as -q yyyyQ[1-4], e.g. 
-q 2011Q3, not '$quarter_only'\n") ; } - $quarter_only =~ s/^(\d\d\d\d)(Q\d)$/$1 $2/ ; - &Log ("QUARTER ONLY $quarter_only\n") ; + $reportdaysback = $options {"d"} ; + $reportmonth = $options {"m"} ; + + if ($reportdaysback =~ /^-\d+$/) + { + ($sec,$min,$hour,$day,$month,$year) = localtime (time+$reportdaysback*86400) ; + $reportmonth = sprintf ("%04d-%02d",$year+1900,$month+1) ; + } + + &Log ("Report month = $reportmonth\n") ; } + else { &Log ("No valid run option found. Specify -c [-q ..]| -m ..| -d ..| -w") ; exit ; } + # date range used to be read from csv file with ReadDate, now there are daily csv files # if earlier methods still is useful it needs to be tweaked # if (($reportmonth ne "") && ($reportmonth !~ /^\d{6}$/)) + if ($quarter_only ne '') + { $path_reports = "$path_reports/$quarter_only" ; } + elsif ($reportmonth ne '') + { $path_reports = "$path_reports/$reportmonth" ; } + elsif ($reportcountries) + { $path_reports = "$path_reports/countries" ; } + + print "Write report to $path_reports\n" ; + + $path_reports =~ s/ /-/g ; + if (! -d $path_reports) + { + # print "mkdir $path_reports\n" ; + mkdir ($path_reports) || die "Unable to create directory $path_reports\n" ; + } + &InitProjectNames ; - $file_csv_country_codes = "CountryCodes.csv" ; - &ReadInputCountriesNames ; if ($reportcountries) @@ -90,25 +132,12 @@ exit ; } - $reportdaysback = $options {"d"} ; - $reportmonth = $options {"m"} ; - - if (($reportmonth !~ /^\d\d\d\d-\d\d$/) && ($reportdaysback !~ /^-\d+$/)) - { &Log ("Specify month as -m yyyy-mm or days back as -d -[days] (e.g. 
-d -1 for yesterday)") ; exit ; } - - if ($reportdaysback =~ /^-\d+$/) - { - ($sec,$min,$hour,$day,$month,$year) = localtime (time+$reportdaysback*86400) ; - $reportmonth = sprintf ("%04d-%02d",$year+1900,$month+1) ; - } - &Log ("Report month = $reportmonth\n") ; - $days_in_month = &DaysInMonth (substr ($reportmonth,0,4), substr ($reportmonth,5,2)) ; $threshold_mime = 0 ; $threshold_project = 10 ; - $file_log = "WikiReportsSampledVisitorsLog.log" ; + $file_log = "SquidReportArchive.log" ; $file_html_crawlers = "SquidReportCrawlers.htm" ; $file_html_methods = "SquidReportMethods.htm" ; @@ -124,6 +153,7 @@ $file_html_clients_html = "SquidReportClientsHtmlOnly.htm" ; $file_html_countries_info = "SquidReportCountryData.htm" ; + $file_csv_user_agents = "SquidReportUserAgents.csv" ; # names till 2010-07-01 # # $file_csv_crawlers = "SquidDataCrawlers.csv" ; @@ -161,8 +191,8 @@ print "\n\nJob SquidReportArchive.pl\n\n" ; - if (! -d "$path_in/$reportmonth") - { print "Directory not found: $path_in\/$reportmonth\n" ; exit ; } + if (! -d "$path_csv/$reportmonth") + { print "Directory not found: $path_csv\/$reportmonth\n" ; exit ; } # for ($month = 4 ; $month <= 10 ; $month ++) # { @@ -173,7 +203,7 @@ # last if ($month == 10) && ($day > 24) # temp code stay with DST summer time zone for SV $date = $reportmonth . "-". sprintf ("%02d", $day) ; - $dir = "$path_in/$reportmonth/$date" ; + $dir = "$path_csv/$reportmonth/$date" ; if (-d $dir) { @@ -195,9 +225,6 @@ if ($#dirs_process < 0) { print "No valid data to process.\n" ; exit ; } - $path_reports = "$path_in/$reportmonth" ; - print "Write report to $path_reports\n" ; - $google_ip_ranges = "<b>IP ranges:</b> known ip ranges for Google are 64.233.[160.0-191.255], 66.249.[64.0-95.255], 66.102.[0.0-15.255], 72.14.[192.0-255.255], <br>74.125.[0.0-255.255], " . 
"209.085.[128.0-255.255], 216.239.[32.0-63.255] and a few minor other subranges</small><p>\n" ; @@ -211,6 +238,7 @@ { $days_input_found ++ ; + print "\nRead input from $path_process\n" ; &ReadInputClients ; &ReadInputCrawlers ; &ReadInputMethods ; @@ -347,9 +375,9 @@ $file_csv_per_country_overview = "SquidReport${selection}PerCountryOverview.csv" ; $file_csv_per_country_density = "SquidReport${selection}PerCountryDensity.csv" ; - $path_csv_squid_counts_monthly = "$path_in/$file_csv_squid_counts_monthly" ; + $path_csv_squid_counts_monthly = "$path_csv/$file_csv_squid_counts_monthly" ; if (! -e $path_csv_squid_counts_monthly) { abort ("Input file $path_csv_squid_counts_monthly not found!") ; } - $path_csv_squid_counts_daily = "$path_in/$file_csv_squid_counts_daily" ; + $path_csv_squid_counts_daily = "$path_csv/$file_csv_squid_counts_daily" ; if (! -e $path_csv_squid_counts_daily) { abort ("Input file $path_csv_squid_counts_daily not found!") ; } &ReadInputCountriesMonthly ($project_mode) ; @@ -567,7 +595,7 @@ { &Log ("ReadCountryCodes\n") ; - open CODES, '<', "$path_in/$file_csv_country_codes" ; + open CODES, '<', "$path_csv/$file_csv_country_codes" ; while ($line = <CODES>) { if ($line =~ /^[A-Z]/) @@ -1364,7 +1392,8 @@ { &Log ("ReadInputCountriesNames\n") ; - $path_csv_country_codes = "$path_in/$file_csv_country_codes" ; + $file_csv_country_codes = "CountryCodes.csv" ; + $path_csv_country_codes = "$path_csv/$file_csv_country_codes" ; if (! 
-e $path_csv_country_codes) { abort ("Input file $path_csv_country_codes not found!") ; } open CSV_COUNTRY_CODES, '<', $path_csv_country_codes ; @@ -1409,13 +1438,12 @@ # http://en.wikipedia.org/wiki/List_of_countries_by_population # http://en.wikipedia.org/wiki/List_of_countries_by_number_of_Internet_users - &Log ("Read $path_in/$file_csv_country_meta_info\n") ; - open COUNTRY_META_INFO, '<', "$path_in/$file_csv_country_meta_info" ; + &Log ("Read $path_csv/$file_csv_country_meta_info\n") ; + open COUNTRY_META_INFO, '<', "$path_csv/$file_csv_country_meta_info" ; binmode COUNTRY_META_INFO ; while ($line = <COUNTRY_META_INFO>) { chomp $line ; - $line =~ s/[\x00-\x1f]//g ; ($country,$link,$population,$connected,$icon) = split ',', $line ; @@ -1447,6 +1475,7 @@ if ($connected eq 'connected') { &Log ("connected unknown: $country\n") ; } + $connected =~ s/connected/../g ; $country_meta_info {$country} = "$link,$population,$connected,$icon" ; @@ -1466,6 +1495,7 @@ { $country_name = $country_names {$country_code} ; $country_meta = $country_meta_info {$country_name} ; + my ($link,$population,$connected,$icon) = split (',', $country_meta) ; $region_code = $region_codes {$country_code} ; @@ -1948,6 +1978,22 @@ my $file_csv = "$path_process/$file_csv_countries_info" ; if (! 
-e $file_csv) { abort ("Function ReadInputCountryInfo: file $file_csv not found!!!") ; } + #$allcountrytotal = 0 ; + #$countrytotal = { } ; + #$allcountrybrowser = { } ; + #$countrybrowser = { } ; + #$allcountryos = { } ; + #$countryos = { } ; + #$allcountrymobile = 0 ; + #$countrymobile = { } ; + undef $allcountrytotal ; + undef %countrytotal ; + undef %allcountrybrowser ; + undef %countrybrowser ; + undef %allcountryos ; + undef %countryos ; + undef $allcountrymobile ; + undef %countrymobile ; open CSV_COUNTRIES_INFO, '<', $file_csv ; while ($line = <CSV_COUNTRIES_INFO>) { @@ -4848,7 +4894,10 @@ $altbgcolor = '#DDFFDD' ; open FILE_HTML_USER_AGENTS, '>', "$path_reports/$file_html_user_agents" ; + open FILE_CSV_USER_AGENTS, '>', "$path_reports/$file_csv_user_agents" ; + $csv_out = "# user agents lay-out\n" ; + $csv_out = "# pageviews total, pageviews mobile, pageviews main, opensearch, all total, all mobile, all main, all other\n" ; $html = $header ; $html =~ s/TITLE/Wikimedia Traffic Analysis Report - User Agent Overview/ ; $html =~ s/HEADER/Wikimedia Traffic Analysis Report - User Agent Overview/ ; @@ -4857,7 +4906,7 @@ $html =~ s/X1000/⇒ <font color=#008000><b>all counts x 1000<\/b><\/font>.<br>/ ; $html .= "<table border=1>\n" ; - + $html .= "<tr><th class=l valign='top' rowspan=2 colspan=4> </th><th rowspan=16> </th><th class=c colspan=5>Page views</th><th rowspan=16> </th><th class=c colspan=5>All requests</th><th rowspan=16> </th></tr>\n" ; $html .= "<tr><th class=c>Total</th><th class=c>Percentage</th><th class=c>To mobile</th><th class=c>To main site</th><th class=c>Search-based estimate<a href='#explain_search'>[1]</a></th>" ; $html .= "<th class=c>Total</th><th class=c>Percentage</th><th class=c>To mobile</th><th class=c>To main site</th><th class=c>To other servers<a href='#explain_other'>[2]</a></th></tr>\n" ; @@ -4915,7 +4964,7 @@ $html =~ s/TITLE/Wikimedia Traffic Analysis Report - Data per Country/ ; $html =~ s/HEADER/Wikimedia Traffic Analysis 
Report - Data per Country/ ; $html =~ s/ALSO/ See also: <b>LINKS<\/b>/ ; - $html =~ s/LINKS/$link_requests $link_origins \/ $link_methods \/ $link_scripts \/ $dummy_user_agents \/ $link_skins \/ $link_crawlers \/ $link_opsys \/ $link_browsers \/ $link_google/ ; + $html =~ s/LINKS/$link_requests $link_origins \/ $link_methods \/ $link_scripts \/ $link_user_agents \/ $link_skins \/ $link_crawlers \/ $link_opsys \/ $link_browsers \/ $link_google/ ; $html =~ s/X1000/⇒ <font color=#008000><b>all counts x 1000<\/b><\/font>.<br>/ ; $html .= "<table border=1 width=800>\n" ; @@ -5175,10 +5224,10 @@ sub WriteCsvCountriesTimed { - &Log ("WriteCsvCountriesTimed: $path_out/$file_csv_countries_timed\n") ; + &Log ("WriteCsvCountriesTimed: $path_csv/$file_csv_countries_timed\n") ; $multiplier_1000 = 1000 * $multiplier ; - open CSV_COUNTRIES_TIMED, '>', "$path_out/$file_csv_countries_timed" ; + open CSV_COUNTRIES_TIMED, '>', "$path_csv/$file_csv_countries_timed" ; foreach $target (sort keys %targets) { @@ -5230,9 +5279,9 @@ # http://www.maxmind.com/app/iso3166 country codes sub WriteCsvCountriesGoTo { - &Log ("WriteCsvCountriesGoTo: $path_out/$file_csv_countries_languages_visited\n") ; + &Log ("WriteCsvCountriesGoTo: $path_csv/$file_csv_countries_languages_visited\n") ; - open CSV_COUNTRIES_LANGUAGES_VISITED, '>', "$path_out/$file_csv_countries_languages_visited" ; + open CSV_COUNTRIES_LANGUAGES_VISITED, '>', "$path_csv/$file_csv_countries_languages_visited" ; foreach $country (sort keys %countries) { @@ -5398,7 +5447,7 @@ $index = &HtmlIndex (join '/ ', sort (@index_languages)) ; $html =~ s/INDEX/$index/ ; - &PrintHtml ($html, "$path_out/$file_html_per_language_breakdown") ; + &PrintHtml ($html, "$path_reports/$file_html_per_language_breakdown") ; } sub WriteReportPerCountryOverview @@ -5689,7 +5738,7 @@ $html =~ s/TOTAL/$html_total/ ; $html =~ s/REGIONS/$html_regions/ ; - &PrintHtml ($html, "$path_out/$file_html_per_country_overview") ; + &PrintHtml ($html, 
"$path_reports/$file_html_per_country_overview") ; } #sub WriteReportPerCountryOverviewLine @@ -5845,7 +5894,7 @@ # $file_csv_per_country_overview2 = $file_csv_per_country_overview ; # $file_csv_per_country_overview2 =~ s/\.csv/-$postfix.csv/ ; - &PrintCsv ($header_csv_countries . join ('', sort @csv_countries), "$path_out/$file_csv_per_country_density") ; + &PrintCsv ($header_csv_countries . join ('', sort @csv_countries), "$path_csv/$file_csv_per_country_density") ; } sub WriteCsvSvgFilePerCountryOverview @@ -6022,7 +6071,7 @@ $file_csv_per_country_overview2 = $file_csv_per_country_overview ; $file_csv_per_country_overview2 =~ s/\.csv/-$postfix.csv/ ; - &PrintCsv ($header_csv_countries . join ('', sort @csv_countries), "$path_out/svg/$file_csv_per_country_overview2") ; + &PrintCsv ($header_csv_countries . join ('', sort @csv_countries), "$path_csv/svg/$file_csv_per_country_overview2") ; # $perc_tot = 0 ; # foreach $code (keys_sorted_by_value_num_desc %requests_per_connected_persons) @@ -6386,9 +6435,9 @@ $html =~ s/INDEX/$index/ ; if (! 
$show_logcount) - { &PrintHtml ($html, "$path_out/$file_html_per_country_breakdown") ; } + { &PrintHtml ($html, "$path_reports/$file_html_per_country_breakdown") ; } else - { &PrintHtml ($html, "$path_out/$file_html_per_country_breakdown_huge") ; } + { &PrintHtml ($html, "$path_reports/$file_html_per_country_breakdown_huge") ; } } sub WriteReportPerCountryTrends @@ -6527,7 +6576,7 @@ $index = &HtmlIndex (join '/ ', sort (@index_countries)) ; $html =~ s/INDEX/$index/ ; - &PrintHtml ($html, "$path_out/$file_html_per_country_trends") ; + &PrintHtml ($html, "$path_reports/$file_html_per_country_trends") ; } sub CorrectForMissingDays @@ -6681,7 +6730,7 @@ # close "FILE_LOG" ; # } # open "FILE_LOG", ">>", "$path_reports/$file_log" || abort ("Log file '$file_log' could not be opened.") ; - open "FILE_LOG", ">>", "$path_reports/$file_log" || abort ("Log file '$file_log' could not be opened.") ; + open "FILE_LOG", ">>", "$path_log/$file_log" || abort ("Log file '$file_log' could not be opened.") ; &Log ("\n\n===== Wikimedia Sampled Visitors Log Report / " . date_time_english (time) . " =====\n\n") ; } @@ -7200,8 +7249,8 @@ { &Log ("$country\n") ; } } - &Log ("Write $path_in/$file_csv_country_meta_info\n\n") ; # use $path_in, not $path_out so that next step picks up proper file - open COUNTRY_META_INFO, '>', "$path_in/$file_csv_country_meta_info" ; + &Log ("Write $path_csv/$file_csv_country_meta_info\n\n") ; + open COUNTRY_META_INFO, '>', "$path_csv/$file_csv_country_meta_info" ; foreach $country (sort keys %countries) { print COUNTRY_META_INFO $countries {$country} ; } close COUNTRY_META_INFO ; Modified: trunk/wikistats/squids/SquidReportArchive.sh =================================================================== --- trunk/wikistats/squids/SquidReportArchive.sh 2012-04-30 12:39:01 UTC (rev 115088) +++ trunk/wikistats/squids/SquidReportArchive.sh 2012-04-30 16:52:33 UTC (rev 115089) @@ -1,24 +1,33 @@ #! 
/bin/sh ulimit -v 4000000 -home="/a/ezachte" -month=2011-08 +month=2012-01 +quarter=2012Q1 -# perl $home/SquidReportArchive.pl -m 201007 > SquidReportArchiveLog.txt +stats=/a/squid/stats +scripts=$stats/scripts +cd $scripts + +# perl $scripts/SquidReportArchive.pl -m 201007 > SquidReportArchiveLog.txt # after further automating SquidScanCountries.sh -# perl $home/SquidCountryScan.pl # start in July 2009 -# perl $home/SquidReportArchive.pl -c # >> SquidReportArchiveLog.txt # -c for per country reports -# perl $home/SquidReportArchive.pl -c -q 2010Q2 # >> SquidReportArchiveLog.txt # -c for per country reports -perl $home/SquidReportArchive.pl -m $month # >> SquidReportArchiveLog.txt +# perl SquidCountryScan.pl # collect csv data for all months, start in July 2009 +# perl SquidReportArchive.pl -c # >> SquidReportArchiveLog.txt # -c for per country reports +# perl SquidReportArchive.pl -c -q $quarter # >> SquidReportArchiveLog.txt # -c for per country reports +# perl SquidReportArchive.pl -m $month # >> SquidReportArchiveLog.txt +perl SquidReportArchive.pl -m 2011-10 # >> SquidReportArchiveLog.txt +perl SquidReportArchive.pl -m 2011-11 # >> SquidReportArchiveLog.txt +perl SquidReportArchive.pl -m 2011-12 # >> SquidReportArchiveLog.txt +perl SquidReportArchive.pl -m 2012-01 # >> SquidReportArchiveLog.txt +perl SquidReportArchive.pl -m 2012-02 # >> SquidReportArchiveLog.txt +perl SquidReportArchive.pl -m 2012-03 # >> SquidReportArchiveLog.txt + -ls -l /a/ezachte/reports*$month* -rm /a/ezachte/reports*$month* - -tar -cf /a/ezachte/$month/$month-html.tar /a/ezachte/$month/*.htm -cp /a/ezachte/$month/$month-html.tar ./reports-traffic-$month.tar +exit +tar -cf $stats/$month/$month-html.tar $reports/$month/*.htm +cp $reports/$month/$month-html.tar ./reports-traffic-$month.tar tar -cf reports-countries-$month.tar SquidReportPage*.htm bzip2 -f reports-traffic-$month.tar bzip2 -f reports-countries-$month.tar tar -cf reports-$month.tar reports-*-$month.tar.bz2 -rm 
/a/ezachte/reports*$month*.bz2 +rm $reports/reports*$month*.bz2 Modified: trunk/wikistats/squids/SquidReportArchiveConfig.pm =================================================================== --- trunk/wikistats/squids/SquidReportArchiveConfig.pm 2012-04-30 12:39:01 UTC (rev 115088) +++ trunk/wikistats/squids/SquidReportArchiveConfig.pm 2012-04-30 16:52:33 UTC (rev 115089) @@ -1,16 +1,20 @@ #!/usr/bin/perl - $cfg_liblocation = "/home/ezachte/lib" ; + $cfg_liblocation = "/a/squid/stats/scripts" ; - $cfg_path_in_production = "/a/ezachte" ; - $cfg_path_out_production = "/a/ezachte" ; -# $cfg_path_in_test = "W:/# Out Locke" ; # Erik -# $cfg_path_out_test = "W:/# Out Test/Locke" ; # Erik - $cfg_path_in_test = "/srv/erik/" ; # Andr\xE9 - $cfg_path_out_test = "/srv/erik/" ; # Andr\xE9 + $cfg_path_csv = "/a/squid/stats/csv" ; + $cfg_path_reports = "/a/squid/stats/reports" ; + $cfg_path_log = "/a/squid/stats/scripts" ; + + $cfg_path_csv_test = "W:/# Out Locke" ; # Erik + $cfg_path_reports_test = "W:/# Out Test/Locke" ; # Erik + $cfg_path_log_test = "W:/# Out Test/Locke" ; # Erik +# $cfg_path_csv_test = "/srv/erik/" ; # Andr\xE9 +# $cfg_path_reports_test = "/srv/erik/" ; # Andr\xE9 + $cfg_path_log_test = "/srv/erik/" ; # Andr\xE9 # set default arguments for test on local machine -# $cfg_default_argv = "-m 2011-07" ; # monthly report + $cfg_default_argv = "-m 2011-08" ; # monthly report # $cfg_default_argv = "-w" ; # refresh country info from Wikipedia (population etc) # $cfg_default_argv = "-c" ; # country/regional reports - $cfg_default_argv = "-c -q 2011Q4" ; # country/regional reports based on data for one quarter only +# $cfg_default_argv = "-c -q 2011Q4" ; # country/regional reports based on data for one quarter only Added: trunk/wikistats/squids/SquidTraceUniqueImages.pl =================================================================== --- trunk/wikistats/squids/SquidTraceUniqueImages.pl (rev 0) +++ trunk/wikistats/squids/SquidTraceUniqueImages.pl 2012-04-30 
16:52:33 UTC (rev 115089) @@ -0,0 +1,83 @@ +#!/usr/bin/perl + + $max_days_ago = 100 ; + $time_start = time ; + + open CSV_OUT, '>', '/a/ezachte/SquidDataTrendUniqueImages.csv' ; + print CSV_OUT ",unique files,,,,unique images\n" ; + print CSV_OUT "date,count,delta,,date,count,delta\n" ; + + for ($days_ago = $max_days_ago ; $days_ago > 0 ; $days_ago --) + { + ($day,$month,$year) = (localtime ($time_start - 3600 * 24 * $days_ago))[3,4,5]; + $month++ ; + $year+=1900 ; + $yyyy_mm = sprintf ("%04d-%02d", $year, $month) ; + $yyyy_mm_dd = sprintf ("%04d-%02d-%02d", $year, $month, $day) ; + $date_excel = "\"=DATE($year,$month,$day)\"" ; + + # print "$days_ago days ago -> $yyyy_mm_dd\n" ; + + $file = "/a/ezachte/$yyyy_mm/$yyyy_mm_dd/public/SquidDataBinaries.csv" ; + + if (! -e $file) + { print "No file $file\n" ; next } + + print "Process $file\n" ; + + open CSV_IN, '<', $file ; + while ($line = <CSV_IN>) + { + chomp $line ; + + next if $line =~ /^#/ ; + next if $line =~ /^:/ ; + + if ($line =~ /,.*,/) # forgot to encode comma's in image name + { + $line =~ s/,([^,]*)$/#?#?#$1/; + $line =~ s/,/%47/g; + $line =~ s/\#\?\#\?\#/,/; + } + + if ($line =~ /,.*,/) # not fixed ? 
+ { + $line =~ s/,([^,]*)$/^#^$1/; + $line =~ s/,/%47/g; + print "\nSkip $line\n" ; + next ; + } + + ($file,$count) = split (',', $line) ; + + if ($files {$file} == 0) + { + $unique_files++ ; + $files {$file} += $count ; + } + + # print "1 $file\n" ; + $file =~ s/^.*\/\d\/\d\w\/// ; + # print "2 $file\n" ; + $file =~ s/\/.*$// ; + # print "3 $file\n" ; + + if ($images {$file} == 0) + { + $unique_images++ ; + $images {$file} += $count ; + } + + } + + $delta_files = $unique_files - $unique_files_prev ; + $delta_images = $unique_images - $unique_images_prev ; + + print "$days_ago,$date_excel,$unique_files,$unique_images\n" ; + print CSV_OUT "$date_excel,$unique_files,$delta_files,,$date_excel,$unique_images,$delta_images\n" ; + + $unique_files_prev = $unique_files ; + $unique_images_prev = $unique_images ; + } + + Added: trunk/wikistats/squids/VizCollectEvents.pl =================================================================== --- trunk/wikistats/squids/VizCollectEvents.pl (rev 0) +++ trunk/wikistats/squids/VizCollectEvents.pl 2012-04-30 16:52:33 UTC (rev 115089) @@ -0,0 +1,223 @@ +#!/usr/bin/perl + + use Time::Local ; + use Compress::Zlib; + use Getopt::Std ; + + my $options ; + getopt ("d", \%options) ; + $date = $options {"d"} ; + + die "Specify date as yyyy/mm/dd" if $date !~ /^\d\d\d\d\/\d\d\/\d\d$/ ; + ($year,$month,$day) = split ('\/', $date) ; + + $date1 = sprintf ("%04d%02d%02d", $year, $month, $day) ; + $time = timegm (0,0,0,$day,$month-1,$year-1900) ; + ($sec,$min,$hour,$day2,$month2,$year2) = gmtime ($time+24*3600) ; + $date2 = sprintf ("%04d%02d%02d", $year2+1900, $month2+1, $day2) ; + + if (-d "/a/ezachte") + { + $dir_in = "/a/squid/archive" ; + $dir_out = "/a/ezachte" ; + } + else + { + print "Test on Windows\n" ; + use IO::Uncompress::Gunzip qw(gunzip $GunzipError) ; # install IO-Compress-Zlib + use IO::Compress::Gzip qw(gzip $GzipError) ; # install IO-Compress-Zlib + + $dir_in = "." ; + $dir_out = "." ; + } + + $dir_out .= "/" . 
sprintf ("%04d-%02d", $year, $month) ; + + if (! -d $dir_out) + { + print "mkdir $dir_out\n" ; + mkdir ($dir_out) || die "Unable to create directory $dir_out\n" ; + } + + $dir_out .= "/" . sprintf ("%04d-%02d-%02d", $year, $month, $day) ; + if (! -d $dir_out) + { + print "mkdir $dir_out\n" ; + mkdir ($dir_out) || die "Unable to create directory $dir_out\n" ; + print "mkdir $dir_out/private\n" ; + mkdir ("$dir_out/private") || die "Unable to create directory $dir_out/private\n" ; + print "mkdir $dir_out/public\n" ; + mkdir ("$dir_out/public" ) || die "Unable to create directory $dir_out/public\n" ; + } + + &CollectEdits ($date1,$date2) ; +# &CollectViews ($date1,$date2) ; + + print "\n\nReady\n\n" ; + exit ; + +sub CollectEdits +{ + my ($date1,$date2) = @_ ; + + $file_date1 = "$dir_in/edits.log-$date1.gz" ; + $file_date2 = "$dir_in/edits.log-$date2.gz" ; + $file_out = "$dir_out/private/SquidDataEditsVizDoNotPublish-$date1.gz" ; + + die ("File not found: $file_date1\n") if (! -e $file_date1) ; + die ("File not found: $file_date2\n") if (! -e $file_date2) ; + +# open OUT, '>', "$dir_out/edits-$date1.txt" ; + $gz_out = gzopen ($file_out, "wb") || die "Unable to write $file_out $!\n" ; + &FilterRequests ($file_date1,$date1,$date2) ; + &FilterRequests ($file_date2,$date1,$date2) ; + $gz_out->gzclose(); +# close OUT ; +} + +sub CollectViews +{ + my ($date1,$date2) = @_ ; + + $file_date1 = "$dir_in/sampled-1000.log-$date1.gz" ; + $file_date1 = "$dir_in/sampled-1000.log-$date2.gz" ; + $file_out = "$dir_out/viz-edits-$date1.gz" ; + + die ("File not found: $file_date1\n") if (! -e $file_date1) ; + die ("File not found: $file_date2\n") if (! 
-e $file_date2) ; + +# open OUT, '>', "$dir_out/edits-$date1.txt" ; + $gz_out = gzopen ($file_out, "wb") || die "Unable to write $file_out $!\n" ; + &FilterRequests ($file_date1,$date1,$date2) ; + &FilterRequests ($file_date2,$date1,$date2) ; + $gz_out->gzclose(); +# close OUT ; +} + +sub FilterRequests # copy the edit submits dated $date1 from gzipped log $file to $gz_out (opened and closed by the caller) as "time,ip,domain,B|M" lines +{ + my ($file,$date1,$date2) = @_ ; # $date1 = day to keep, $date2 = day at which scanning stops, both passed as yyyymmdd + + $date1 = substr ($date1,0,4) . '-' . substr ($date1,4,2) . '-' . substr ($date1,6,2) ; + $date2 = substr ($date2,0,4) . '-' . substr ($date2,4,2) . '-' . substr ($date2,6,2) ; + + print "\n\nFilterRequests $file $date1 $date2\n\n" ; + + # open IN,"-|", "gzip -dc $file" ; + $gz_in = gzopen ($file, "r") || die "Unable to read $file $!\n" ; +# open IN,"<", $file ; + +# while ($line = <IN>) + my $lines = 0 ; + my $lines2 = 0 ; + while ($gz_in->gzreadline ($line) > 0) + { + @fields = split ' ', $line ; + $time = $fields [2] ; + $ip = $fields [4] ; + $action = $fields [5] ; + $url = $fields [8] ; + $agent = lc ($fields [13]) ; + + if ($lines++ % 10000 == 0) + { print "$time\n" ; } + + last if $time =~ /^$date2/ ; # many lines for subsequent data on second file + next if $time !~ /^$date1/ ; # many lines for previous day on first file + + if ($lines2++ == 0) + { + print "\n\nStart copying...\n\n" ; + print "$time\n" ; + } + + next if $url !~ /action=submit/ ; + next if $action ne "TCP_MISS/302" ; + + if (($agent =~ /bot/i) || ($agent =~ /https?:\/\//)) + { $bot = 'B' ; } + else + { $bot = 'M' ; } + + $url =~ s/^.*?\/\/// ; + ($domain,$location) = split ('\/',$url,2) ; + $domain_original = $domain ; $domain = &Abbreviate ($domain) ; # $domain_original keeps the full host name, it is the key counted in %unrecognized_domains below + if (($domain =~ /\./o) || + ($domain !~ /^[\*\@\%]?!(wb|wn|wp|wq|ws|wv|wk|wx|xx|wm|mw|wmf)\:/o)) + { + $unrecognized_domains {$domain_original} ++ ; + $domain = 'other' ; + } + $domain =~ s/!//o ; # not sure why this happens after Abbreviate, kept inline with SquidCountArchiveProcessLogRecord.pm + + $time = substr ($time,0,19) ; # omit msec + $line = "$time,$ip,$domain,$bot\n" ; + $gz_out->gzwrite($line) || die "Zlib error writing 
to $file_out: " . $gz_out->gzerror . "\n" ; # report the file the caller opened for $gz_out; gzerror must be called outside the string + } + + print "$time\n" ; + + $gz_in->gzclose(); +} + +sub Abbreviate # copied from SquidCountArchiveProcessLogrecord, someday make it separate module +{ + my $domain = shift ; + + $domain =~ s/www\.([^\.]+\.[^\.]+\.[^\.]+)/$1/o ; + $domain =~ s/\.com/\.org/o ; + $domain =~ s/^([^\.]+\.org)/www.$1/o ; + + if ($domain !~ /\.org/o) + { $domain =~ s/www\.(wik[^\.\/]+)\.([^\.\/]+)/$2.$1.org/o ; } + +# $legend = "# wx = wikispecial (commons|mediawiki|meta|foundation|species)\n" ; +# $legend .= "# xx:upload = upload.wikimedia.org\n" ; + $domain =~ s/commons\.wikimedia\.org/!wx:commons/o ; + $domain =~ s/www\.mediawiki\.org/!wx:mediawiki/o ; + $domain =~ s/meta\.wikipedia\.org/!wx:meta/o ; + $domain =~ s/meta\.wikimedia\.org/!wx:meta/o ; + $domain =~ s/foundation\.wikimedia\.org/!wx:foundation/o ; + $domain =~ s/species\.wikimedia\.org/!wx:species/o ; + $domain =~ s/upload\.wikimedia\.org/!xx:upload/o ; + +# $legend .= "# wmf = wikimediafoundation\n" ; +# $legend .= "# wb = wikibooks\n" ; +# $legend .= "# wn = wikinews\n" ; +# $legend .= "# wp = wikipedia\n" ; +# $legend .= "# wq = wikiquote\n" ; +# $legend .= "# ws = wikisource\n" ; +# $legend .= "# wv = wikiversity\n" ; +# $legend .= "# wk = wiktionary\n" ; +# $legend .= "# wm = wikimedia\n" ; +# $legend .= "# mw = mediawiki\n" ; +# $legend .= "# \@ = .mobile.\n" ; +# $legend .= "# \* = .wap.\n" ; +# $legend .= "# \% = .m.\n" ; + + $domain =~ s/wikimediafoundation/!wmf/o ; + $domain =~ s/wikibooks/!wb/o ; + $domain =~ s/wikinews/!wn/o ; + $domain =~ s/wikipedia/!wp/o ; + $domain =~ s/wikiquote/!wq/o ; + $domain =~ s/wikisource/!ws/o ; + $domain =~ s/wikiversity/!wv/o ; + $domain =~ s/wiktionary/!wk/o ; + $domain =~ s/wikimedia/!wm/o ; + $domain =~ s/mediawiki/!mw/o ; + + $domain =~ s/\.mobile\./.@/o ; + $domain =~ s/\.wap\./.*/o ; + $domain =~ s/\.m\./.%/o ; + +# if ($domain =~ /^error:/o) +# { $domain_errors {$domain}++ ; } +# $domain =~ s/error:.*$/!error:1/o ; + + $domain =~ 
s/^([^\.\/]+)\.([^\.\/]+)\.org/$2:$1/o ; + + $domain =~ s/\s//g ; + + return ($domain) ; +} + Added: trunk/wikistats/squids/VizCollectEventsMonth.pl =================================================================== --- trunk/wikistats/squids/VizCollectEventsMonth.pl (rev 0) +++ trunk/wikistats/squids/VizCollectEventsMonth.pl 2012-04-30 16:52:33 UTC (rev 115089) @@ -0,0 +1,35 @@ +#!/usr/bin/perl + + use lib "/home/ezachte/lib" ; + use EzLib ; + $trace_on_exit = $true ; + ez_lib_version (14) ; + + use Time::Local ; + use Compress::Zlib; + use Getopt::Std ; + + my $options ; # NOTE(review): this scalar is never used; getopt below fills the hash %options + getopt ("m", \%options) ; + $month = $options {"m"} ; + + die "Specify month as -m yyyy/mm" if $month !~ /^\d\d\d\d\/\d\d$/ ; + ($year,$month) = split ('\/', $month) ; + + $days = &days_in_month ($year,$month) ; # days_in_month is not defined in this script, presumably provided by EzLib + for ($day = 1 ; $day <= $days ; $day++) # run VizCollectEvents.pl once for every day of the requested month + { + $date = sprintf ("%04d/%02d/%02d",$year,$month,$day) ; ; + $cmd = "\nperl VizCollectEvents.pl -d $date" ; + print "\n$cmd ->\n" ; + + $result = `$cmd` ; # $date is built from the digits validated above + @results = split ("\n", $result) ; + foreach $line (@results) # echo the output of the child script, each line prefixed with '# ' + { print "# $line\n" ; } + } + + print "\n\nReady\n\n" ; + exit ; + + Added: trunk/wikistats/squids/VizPrepJs.pl =================================================================== --- trunk/wikistats/squids/VizPrepJs.pl (rev 0) +++ trunk/wikistats/squids/VizPrepJs.pl 2012-04-30 16:52:33 UTC (rev 115089) @@ -0,0 +1,839 @@ +#!/usr/bin/perl + +# http://tutorialajax.com/compress-javascript-with-gzip.html + + use lib "/home/ezachte/lib" ; + use EzLib ; + $trace_on_exit = $true ; + ez_lib_version (14) ; + + use Time::Local ; + use Geo::IP ; + use Compress::Zlib; + use Getopt::Std ; + + my $options ; # NOTE(review): this scalar is never used; getopt below fills the hash %options + getopt ("mr", \%options) ; # -m yyyy/mm = month to process, -r = resolution (see the checks further down) + +# if (defined $options {'d'}) +# { +# die "Specify -d or -m , not both" if defined $options {'m'} ; +# $date = $options {"d"} ; +# die "Specify date as -d yyyy/mm/dd" if $date !~ /^\d\d\d\d\/\d\d\/\d\d$/ ; +# ($year,$month,$day) = split ('\/', $date) ; +# } + + if (defined $options {'m'}) + { + $month = 
$options {"m"} ; + die "Specify month as -m yyyy/mm" if $month !~ /^\d\d\d\d\/\d\d$/ ; + ($year,$month) = split ('\/', $month) ; $day = 1 ; # $day is otherwise undefined on this path and timegm below range-checks it + } + else # no -m given: default to the month before the current one (UTC) + { + ($sec,$min,$hour,$day,$month,$year) = gmtime (time) ; + $year += 1900 ; + $month ++ ; + if ($month > 1) + { $month-- ; } + else + { $month = 12 ; $year-- ; } + } + $yyyymm = sprintf ("%04d-%02d", $year, $month) ; # set for both paths: used below for $dir_in and later for the output file names + print "Collect data for month: $yyyymm\n" ; + + @months = qw (January February March April May June July August September October November December) ; + $name_month = $months [$month-1] ; + + # round latitude and longitude to 1/$resolution degrees + $resolution = 8 ; + if (defined $options {'r'}) + { + $resolution = $options {"r"} ; + die "Specify resolution as -r [2|4|8]" if $resolution !~ /^(?:2|4|8)$/ ; + } + + my $gi = Geo::IP -> open ("GeoLiteCity.dat", GEOIP_STANDARD) || die "Could not open GeoLiteCity.dat" ; + + $date1 = sprintf ("%04d%02d%02d", $year, $month, $day) ; + $time = timegm (0,0,0,$day,$month-1,$year-1900) ; # NOTE(review): on the default path $day is today's day of month combined with the previous month, which timegm rejects when that month is shorter - confirm $date1/$date2 are still needed + ($sec,$min,$hour,$day2,$month2,$year2) = gmtime ($time+24*3600) ; + $date2 = sprintf ("%04d%02d%02d", $year2+1900, $month2+1, $day2) ; + + if (-d "/a/ezachte") + { + $dir_in = "/a/ezachte/$yyyymm" ; + $dir_out = "/a/ezachte" ; + } + else + { + print "Test on Windows\n" ; + use IO::Uncompress::Gunzip qw(gunzip $GunzipError) ; # install IO-Compress-Zlib + # use IO::Compress::Gzip qw(gzip $GzipError) ; # install IO-Compress-Zlib + + $dir_in = "." ; + $dir_out = "." 
; + } + + &ListLanguages ; + &ReadEdits ; + &ReadViews ; + +# die "Too few edit files found: $days_processed_edits" if $days_processed_edits < 15 ; +# die "Too few view files found: $days_processed_views" if $days_processed_views < 15 ; + + &WriteJs ; + + print "Days processed for edits:$days_processed_edits\n" ; + print "Days processed for views:$days_processed_views\n" ; + + print "\n\nReady\n\n" ; + exit ; + +sub ReadEdits # read the daily gzipped edit files of $year/$month, geolocate each Wikipedia edit by ip and count edits per location and per language +{ + print "\n\nReadEdits\n\n" ; +# for ($day = 1 ; $day <= 3 ; $day++) + for ($day = 1 ; $day <= days_in_month ($year, $month) ; $day++) + { + $yyyymmdd = sprintf ("%04d-%02d-%02d", $year, $month, $day) ; + ($yyyymmdd2 = $yyyymmdd) =~ s/-//g ; + $file = "$dir_in/$yyyymmdd/private/SquidDataEditsVizDoNotPublish-$yyyymmdd2.gz" ; + print "\nProcess $file\n\n" ; + + if (! -e $file) + { print "File not found!\n" ; next ; } + + $days_processed_edits ++ ; + + # remember individual edits only for last day for which file was found (reset only after the file check, so a missing file does not wipe the edits of the previous day) + undef @data_edits_bots ; + undef %latlong_edits_manual ; + $date_edits = sprintf ("%04d/%02d/%02d", $year, $month, $day) ; + + my $gz_in = new IO::Uncompress::Gunzip $file or die "gunzip failed: $GunzipError\n"; + binmode $gz_in ; + + while ($line = <$gz_in>) + { + # last if $lines_manual > 1000 ; + + chomp $line ; + ($time,$ip,$domain,$bot) = split (',', $line) ; + + next if $domain !~ /wp/ ; + + $rec = $gi -> record_by_name ($ip) ; + next if ! 
defined $rec ; + + $time = substr ($time,11,8) ; + $hh = substr ($time,0,2) ; + $mm = substr ($time,3,2) ; + $ss = substr ($time,6,2) ; + $min = 60 * $hh + $mm ; + + $domain =~ s/^.*?:// ; + $lang = lc ($domain) ; + + $country_code = $rec->country_code ; + $country_code3 = $rec->country_code3 ; + $country_name = $rec->country_name ; + $region = $rec->region ; + $region_name = $rec->region_name ; + $city = $rec->city ; + $postal_code = $rec->postal_code ; + $latitude = $rec->latitude ; + $longitude = $rec->longitude ; + + $secs = (24*3600 + $hh * 3600 + $mm * 60 + $ss + int(rand(1200)-600)) % (24*3600) ; # drift +/- 10 minutes + + # always use low resolution for individual edits, to protect against de-anonymization + $longitude2 = sprintf ("%.0f", $longitude * 2) + 180 * 2 ; # resolution 1/2 degree + $latitude2 = sprintf ("%.0f", $latitude * 2) + 90 * 2 ; # resolution 1/2 degree + + $longitude = sprintf ("%.0f", $longitude * $resolution) + 180 * $resolution ; # resolution 1/$resolution degree + $latitude = sprintf ("%.0f", $latitude * $resolution) + 90 * $resolution ; # resolution 1/$resolution degree + + $locations {"$longitude,$latitude"} ++ ; + + # musings on performance: + # http://code.flickr.com/blog/2009/03/18/building-fast-client-side-searches/ + + if ($bot eq 'B') + { push @data_edits_bots, "$secs,$lang,$country_code,$longitude2,$latitude2;" ; $lines_bot++ ; } + else + { $latlong_edits_manual {"$longitude2,$latitude2,$country_code"} .= "$secs,$lang;" ; $lines_manual++ ; } + + $line = "$bot,$lines_bot,$lines_manual,$time,$secs,$min,$lang,$country_code,$country_code3,$country_name,$region,$region_name,$city,$postal_code,$longitude,$latitude\n" ; + if (($lines_bot + $lines_manual) % 10000 == 0) + { print $line ; } + + # if ($lang !~ /^(?:en|ja|de|fr|es|ru|it|pt|pl|nl|)$/) + # { $lang = 'xx' ; } + $edits_per_location {"$lang,$longitude,$latitude"} ++ ; + $edits_per_location {"all,$longitude,$latitude"} ++ ; + if ($langcodes {$lang} >= 20) + { 
$edits_per_location {"rest,$longitude,$latitude"} ++ ; } + + $edits_per_language {$lang} ++ ; + $edits_per_language {"all"} ++ ; + if ($langcodes {$lang} >= 20) # %langcodes holds the rank by edits assigned in ListLanguages, so 20 and up is outside the first twenty + { $edits_per_language {"rest"} ++ ; } + } + close ($gz_in) ; + } + print "Edit files processed: " . (0+$days_processed_edits) . "\n" ; + + return ; +} + +sub ReadViews # read the daily gzipped view files of $year/$month, geolocate each non-bot Wikipedia page view by ip and count views per location, language and platform +{ + print "\n\nReadViews\n\n" ; +# for ($day = 1 ; $day <= 3 ; $day++) + for ($day = 1 ; $day <= days_in_month ($year, $month) ; $day++) + { + $yyyymmdd = sprintf ("%04d-%02d-%02d", $year, $month, $day) ; + ($yyyymmdd2 = $yyyymmdd) =~ s/-//g ; + $file = "$dir_in/$yyyymmdd/private/SquidDataViewsVizDoNotPublish-$yyyymmdd2.gz" ; + print "\nProcess $file\n\n" ; + + if (! -e $file) # a missing day is skipped and not counted in $days_processed_views + { print "File not found!\n" ; next ; } + + $days_processed_views ++ ; + + my $gz_in = new IO::Uncompress::Gunzip $file or die "gunzip failed: $GunzipError\n"; + binmode $gz_in ; + + $lines_read = 0 ; + while ($line = <$gz_in>) + { + # for tests: + # last if $lines_read > 1000 ; + + chomp $line ; + ($time,$ip,$domain,$bot,$mobile,$os,$client,$mimecat) = split (',', $line) ; + + next if $domain !~ /wp/ ; + next if $mimecat ne "page" ; + + $rec = $gi -> record_by_name ($ip) ; + next if ! 
defined $rec ; + next if $bot eq 'B' ; + + $mobile_os_client {"$mobile,$os,$client"} ++ ; + + $time = substr ($time,11,8) ; + $hh = substr ($time,0,2) ; + $mm = substr ($time,3,2) ; + $ss = substr ($time,6,2) ; + $min = 60 * $hh + $mm ; + + $domain =~ s/^.*?:// ; + $lang = lc ($domain) ; + + $country_code = $rec->country_code ; + $country_code3 = $rec->country_code3 ; + $country_name = $rec->country_name ; + $region = $rec->region ; + $region_name = $rec->region_name ; + $city = $rec->city ; + $postal_code = $rec->postal_code ; + $latitude = $rec->latitude ; + $longitude = $rec->longitude ; + + $secs = (24*3600 + $hh * 3600 + $mm * 60 + $ss + int(rand(1200)-600)) % (24*3600) ; # drift +/- 10 minutes + + # next if $secs < 12 * 3600 ; # one hour only for iPod iPhone + # next if $secs > 13 * 3600 ; # one hour only for iPod iPhone + + $longitude = sprintf ("%.0f", $longitude * $resolution) + 180 * $resolution ; # resolution 1/$resolution degree + $latitude = sprintf ("%.0f", $latitude * $resolution) + 90 * $resolution ; # resolution 1/$resolution degree + + $locations {"$longitude,$latitude"} ++ ; + + $line = "$time,$secs,$min,$lang,$country_code,$country_code3,$country_name,$region,$region_name,$city,$postal_code,$longitude,$latitude\n" ; + if ($lines_read++ % 10000 == 0) # progress trace: echo one record out of every 10000 + { print $line ; } + + # if ($lang !~ /^(?:en|ja|de|fr|es|ru|it|pt|pl|nl|)$/) + # { $lang = 'xx' ; } + + $views_per_location {"$lang,$longitude,$latitude"} ++ ; + $views_per_location {"all,$longitude,$latitude"} ++ ; + if ($langcodes {$lang} >= 20) # rank by edits from ListLanguages: 20 and up is outside the first twenty + { $views_per_location {"rest,$longitude,$latitude"} ++ ; } + + $views_per_language {$lang} ++ ; + $views_per_language {"all"} ++ ; + if ($langcodes {$lang} >= 20) + { $views_per_language {"rest"} ++ ; } + + $platform_per_location {"$mobile,$longitude,$latitude"} ++ ; + } + close ($gz_in) ; + } + print "View files processed: " . (0+$days_processed_views) . 
"\n" ; + +# open CSV_TEST, ">", "test.csv" ; +# foreach $key (sort {$mobile_os_client {$b} <=> $mobile_os_client {$a}} keys %mobile_os_client) +# { print CSV_TEST "$key," . $mobile_os_client {$key} . "\n" ; } +# close CSV_TEST ; +} + +sub WriteJs # write the collected edit and view data as JavaScript init files, one for manual edits plus frequencies and one for bot edits +{ +# @data_edits_manual = sort {$a <=> $b} @data_edits_manual ; +# $data_edits_manual = join ('', @data_edits_manual) ; + foreach $latlong (sort keys %latlong_edits_manual) + { + $coords ++ ; + $edits = $latlong_edits_manual {$latlong} ; + $edits =~ s/;$// ; + @edits = split (';', $edits) ; + @edits = sort {$a <=> $b} @edits ; # numeric sort on the leading seconds of each "secs,lang" item + + $time_prev = 0 ; + foreach $edit (@edits) # replace each time by the difference with the previous edit at this location + { + ($time,$lang) = split (',', $edit) ; + $edit = ($time - $time_prev) . ",$lang" ; + $time_prev = $time ; + } + $edits = join (';', @edits) ; + $data_edits_manual .= "$latlong|$edits^" ; + } + $data_edits_manual =~ s/\^$// ; + +# at least as long as server side compression doesn't work +# shrink file significantly further, to be patched back in browser + $data_edits_manual =~ s/,en;/E/g ; + $data_edits_manual =~ s/,ja;/J/g ; + $data_edits_manual =~ s/,de;/D/g ; + $data_edits_manual =~ s/,fr;/F/g ; + $data_edits_manual =~ s/,es;/S/g ; + $data_edits_manual =~ s/,ru;/R/g ; + $data_edits_manual =~ s/,it;/I/g ; + $data_edits_manual =~ s/,pt;/P/g ; + $data_edits_manual =~ s/,pl;/L/g ; + $data_edits_manual =~ s/,nl;/N/g ; + + foreach $key (sort keys %edits_per_location) + { + # store daily averages + $data_edit_freq .= $key . ',' . sprintf ("%.0f", $edits_per_location {$key} / $days_processed_edits) . 
';' ; + $lines_frequency++ ; + } + $data_edit_freq =~ s/;$// ; + + foreach $location (sort keys %locations) + { + $desktop = (0 + $platform_per_location {"-,$location"}) ; + $mobile = (0 + $platform_per_location {"M,$location"}) ; + + $tot_views_desktop += $desktop ; + $tot_views_mobile += $mobile ; + + if ($desktop + $mobile >= 10) + { + $perc_mobile = sprintf ("%.0f", 100 * ($mobile / ($desktop + $mobile))) ; + print "$location DESKTOP $desktop MOBILE $mobile PERC $perc_mobile\n" ; + } + + # store daily averages (|| 1: %locations also holds edit locations, so this loop can run when no view file was processed) + $desktop = sprintf ("%.0f", $desktop / ($days_processed_views || 1)) ; + $mobile = sprintf ("%.0f", $mobile / ($days_processed_views || 1)) ; + next if $desktop + $mobile == 0 ; + + $data_view_freq .= "$location,$desktop,$mobile;" ; + $lines_viewstats++ ; + } + + $tot_views_desktop = sprintf ("%.0f", $tot_views_desktop / ($days_processed_views || 1)) ; + $tot_views_mobile = sprintf ("%.0f", $tot_views_mobile / ($days_processed_views || 1)) ; + print "Views per day: " . ($tot_views_desktop + $tot_views_mobile) . "\n" ; + print "Percentage mobile: " . sprintf ("%.1f%%", ($tot_views_desktop + $tot_views_mobile) > 0 ? 100 * $tot_views_mobile / ($tot_views_desktop + $tot_views_mobile) : 0) . 
"\n" ; # the rounded totals can both be zero, so the percentage is guarded + + $data_view_freq =~ s/;$// ; + + $data_languages_sort_by_code =~ s/'/&quote;/g ; # keep apostrophes out of the single-quoted js strings written below; NOTE(review): the archived text showed "e; here, which looks like a decoded &quote; - confirm against the repository + $data_languages_sort_by_edits =~ s/'/&quote;/g ; + + open MANUAL, '>', "WikipediaRequestsInit-$yyyymm-$resolution.js" or die "Could not write WikipediaRequestsInit-$yyyymm-$resolution.js: $!\n" ; + print MANUAL "var resolution = $resolution ;\n" ; + print MANUAL "var date_edits = '$date_edits' ;\n" ; + print MANUAL "var name_month = '$name_month' ;\n\n" ; + print MANUAL "function data_init()\n{\n" ; + print MANUAL "data_languages_sort_by_code='$data_languages_sort_by_code';\n\n" ; + print MANUAL "data_languages_sort_by_edits='$data_languages_sort_by_edits';\n\n" ; + print MANUAL "data_edits='$data_edits_manual';\n\n" ; + print MANUAL "data_freq='$data_edit_freq';\n\n" ; + print MANUAL "data_viewstats='$data_view_freq';" ; + print MANUAL "\n}\n" ; + close MANUAL ; + + @data_edits_bots = sort {$a <=> $b} @data_edits_bots ; # numeric sort on the leading seconds of each item + $data_edits_bots = join ('', @data_edits_bots) ; + $data_edits_bots =~ s/;\\\n$// ; # NOTE(review): items are pushed as "...;" without backslash or newline, so this pattern does not seem to match - confirm + open BOTS, '>', "WikipediaBotEditsInit-$yyyymm.js" or die "Could not write WikipediaBotEditsInit-$yyyymm.js: $!\n" ; + print BOTS "var resolution = $resolution ;\n" ; + print BOTS "function init()\n{\ndata_bots='$data_edits_bots';\n}\n" ; + close BOTS ; + + print "\nBot events written: $lines_bot\n\n" ; + print "\nEdits written: $lines_manual single edits, $lines_frequency points\n\n" ; + + print "Lat/long pairs written: $coords\n\n" ; + + foreach $lang (sort {$edits_per_language {$b} <=> $edits_per_language {$a}} keys %edits_per_language) + { print "$lang:" . $edits_per_language {$lang} . "\n" ; } +} + +sub ListLanguages +{ + &SetLanguageInfo ; + + die "Could not open 'StatisticsMonthly.csv'" if ! 
-e 'StatisticsMonthly.csv' ; + + # counts months history per language + open CSV, '<', 'StatisticsMonthly.csv' ; + while ($line = <CSV>) + { + chomp $line ; + ($lang,$date,@fields) = split (',' , $line) ; + $months {$lang} ++ ; + } + close CSV ; + + # for last 12 months per language accumulate edits + $lang_prev = '' ; + open CSV, '<', 'StatisticsMonthly.csv' ; + while ($line = <CSV>) + { + chomp $line ; + ($lang,$date,@fields) = split (',' , $line) ; + $months2 {$lang} ++ ; + if (($lang ne $lang_prev) && ($lang_prev ne '')) # first row of a new language: store the total of the previous one + { + next if $lang eq 'commons' ; # commons rows are skipped; $lang_prev is left untouched, so its total is stored when the next language starts + $edits_per_lang_last_12_months {$lang_prev} = $edits_prev ; + print "$lang_prev,$date_prev,$edits_prev\n" ; + $edits_prev = 0 ; + } + $lang_prev = $lang ; + $date_prev = $date ; + if ($months2 {$lang} > $months {$lang} - 12) # only the last 12 rows counted for this language in the first pass + { $edits_prev += $fields [11] ; } # NOTE(review): presumably this column holds the edits per month - confirm against the csv layout + } + $edits_per_lang_last_12_months {$lang_prev} = $edits_prev ; + print "$lang_prev,$date_prev,$edits_prev\n" ; + close CSV ; + + foreach $lang (sort keys %edits_per_lang_last_12_months) + { + $lang_name = $out_languages {$lang} ; # %out_languages is not filled in this sub, presumably by SetLanguageInfo + next if $lang_name eq '' ; + $data_languages_sort_by_code .= "$lang:$lang_name;" ; + } + $data_languages_sort_by_code =~ s/;$// ; + + foreach $lang (sort {$edits_per_lang_last_12_months {$b} <=> $edits_per_lang_last_12_months {$a}} keys %edits_per_lang_last_12_months) + { + $lang_name = $out_languages {$lang} ; + + next if $lang_name eq '' ; + $langcodes {$lang} = $count_languages++ ; # rank by edits, starting at 0 for the language with most edits + # print "$lang: ${langcodes {$lang}}\n" ; + + $data_languages_sort_by_edits .= "$lang:$lang_name;" ; + } + $data_languages_sort_by_edits =~ s/;$// ; + +# print "$data_languages_sort_by_code\n\n" ; +# print "$data_languages_sort_by_edits\n\n" ; +} + +sub SetLanguageInfo +{ + # taken from http://meta.wikimedia.org/wiki/List_of_Wikipedias + # see also http://www.loc.gov/standards/iso639-2/php/English_list.php + # url might have been generated from language code, but there were (and will be?) 
exceptions + # see also http://meta.wikimedia.org/wiki/Special:SiteMatrix + # latest language name corrections provided by Mark Williamson + # see also http://meta.wikimedia.org/wiki/Languages + + # numbers in square brackets: number of speakers in millions according to + # http://en.wikipedia.org/w/index.php?title=List_of_languages_by_number_of_native_speakers&oldid=305926069 (Aug 5 2009) + # includes secondary speakers (hence adds up to much more than 6 billion) + %wikipedias = ( +# mediawiki=>"http://wikimediafoundation.org Wikimedia", + nostalgia=>"http://nostalgia.wikipedia.org Nostalgia", + sources=>"http://wikisource.org Multilingual Wikisource", + meta=>"http://meta.wikimedia.org Meta-Wiki", + beta=>"http://beta.wikiversity.org Beta", + species=>"http://species.wikipedia.org Wikispecies", + commons=>"http://commons.wikimedia.org Commons", + foundation=>"http://wikimediafoundation.org Foundation", + strategy=>"http://strategy.wikimedia.org Strategic Planning", + outreach=>"http://outreach.wikimedia.org Outreach", + incubator=>"http://incubator.wikimedia.org Incubator", + usability=>"http://usability.wikimedia.org Usability Initiative", + sep11=>"http://sep11.wikipedia.org In Memoriam", + nlwikimedia=>"http://nl.wikimedia.org Wikimedia Nederland", + plwikimedia=>"http://pl.wikimedia.org Wikimedia Polska", + mediawiki=>"http://www.mediawiki.org MediaWiki", + dewikiversity=>"http://de.wikiversity.org Wikiversität", + frwikiversity=>"http://fr.wikiversity.org Wikiversität", + wikimania2005=>"http://wikimania2005.wikimedia.org Wikimania 2005", + wikimania2006=>"http://wikimania2006.wikimedia.org Wikimania 2006", + aa=>"http://aa.wikipedia.org Afar [1.4,AF]", + ab=>"http://ab.wikipedia.org Abkhazian [0.125,AS]", + ace=>"http://ace.wikipedia.org Acehnese [3,AS]", + af=>"http://af.wikipedia.org Afrikaans [13,AF]", + ak=>"http://ak.wikipedia.org Akan [19,AF]", + als=>"http://als.wikipedia.org Alemannic [10,EU]", # was Elsatian + am=>"http://am.wikipedia.org Amharic 
[25,AF]", + an=>"http://an.wikipedia.org Aragonese [0.01,EU]", + ang=>"http://ang.wikipedia.org Anglo-Saxon [,EU]", + ar=>"http://ar.wikipedia.org Arabic [530,AF,AS]", + arc=>"http://arc.wikipedia.org Aramaic [2.2,AS]", + arz=>"http://arz.wikipedia.org Egyptian Arabic [76,AF]", + as=>"http://as.wikipedia.org Assamese [13,AS,I]", + ast=>"http://ast.wikipedia.org Asturian [0.275,EU]", + av=>"http://av.wikipedia.org Avar [0.06,EU]", + ay=>"http://ay.wikipedia.org Aymara [2.2,SA]", + az=>"http://az.wikipedia.org Azeri [27,AS]", + ba=>"http://ba.wikipedia.org Bashkir [1.9,AS]", + bar=>"http://bar.wikipedia.org Bavarian [12,EU]", + bat_smg=>"http://bat-smg.wikipedia.org Samogitian [0.5,EU]", + "bat-smg"=>"http://bat-smg.wikipedia.org Samogitian", + bcl=>"http://bcl.wikipedia.org Central Bicolano [2.5,AS]", + be=>"http://be.wikipedia.org Belarusian [6.5,EU]", + "be-x-old"=>"http://be-x-old.wikipedia.org Belarusian (Tarakievica) [6.5,EU]", + be_x_old=>"http://be-x-old.wikipedia.org Belarusian (Tarakievica) [6.5,EU]", + bg=>"http://bg.wikipedia.org Bulgarian [12,EU]", + bh=>"http://bh.wikipedia.org Bihari [,AS,I]", + bi=>"http://bi.wikipedia.org Bislama [0.2,OC]", + bjn=>"http://bjn.wikipedia.org Banjar [3.5,AS]", + bm=>"http://bm.wikipedia.org Bambara [6,AF]", + bn=>"http://bn.wikipedia.org Bengali [230,AS,I]", + bo=>"http://bo.wikipedia.org Tibetan [7,AS]", + bpy=>"http://bpy.wikipedia.org Bishnupriya Manipuri [0.45,AS,I]", + br=>"http://br.wikipedia.org Breton [0.25,EU]", + bs=>"http://bs.wikipedia.org Bosnian [2.7,EU]", + bug=>"http://bug.wikipedia.org Buginese [4,AS]", + bxr=>"http://bxr.wikipedia.org Buryat [0.4,AS]", + ca=>"http://ca.wikipedia.org Catalan [9,EU]", + cbk_zam=>"http://cbk-zam.wikipedia.org Chavacano [0.607,AS]", + "cbk-zam"=>"http://cbk-zam.wikipedia.org Chavacano", + cdo=>"http://cdo.wikipedia.org Min Dong [9.1,AS,C]", + ce=>"http://ce.wikipedia.org Chechen [1.33,EU]", + ceb=>"http://ceb.wikipedia.org Cebuano [20,AS]", + ch=>"http://ch.wikipedia.org 
Chamorro [0.06,OC]", + cho=>"http://cho.wikipedia.org Choctaw [0.0179,NA]", # was Chotaw + chr=>"http://chr.wikipedia.org Cherokee [0.018,NA]", + chy=>"http://chy.wikipedia.org Cheyenne [0.000712,NA]", + ckb=>"http://ckb.wikipedia.org Sorani [4,AS]", + co=>"http://co.wikipedia.org Corsican [0.25,EU]", + cr=>"http://cr.wikipedia.org Cree [0.117,NA]", + crh=>"http://crh.wikipedia.org Crimean Tatar [0.456,EU,AS]", + cs=>"http://cs.wikipedia.org Czech [12,EU]", + csb=>"http://csb.wikipedia.org Cassubian [0.05,EU]", + cu=>"http://cv.wikipedia.org Old Church Slavonic [,EU]", + cv=>"http://cv.wikipedia.org Chuvash [1.3,AS]", + cy=>"http://cy.wikipedia.org Welsh [0.75,EU]", + da=>"http://da.wikipedia.org Danish [6,EU]", + de=>"http://de.wikipedia.org German [185,EU]", + diq=>"http://diq.wikipedia.org Zazaki [2,AS]", + dk=>"http://dk.wikipedia.org Danish [6]", + dsb=>"http://dsb.wikipedia.org Lower Sorbian [0.014,EU]", + dv=>"http://dv.wikipedia.org Divehi [0.3,AS,I]", + dz=>"http://dz.wikipedia.org Dzongkha [0.6,AS,I]", + ee=>"http://ee.wikipedia.org Ewe [3.5,AF]", + el=>"http://el.wikipedia.org Greek [15,EU]", + eml=>"http://eml.wikipedia.org Emilian-Romagnol [2,EU]", + en=>"http://en.wikipedia.org English [1500,EU,NA,OC,AS,AF]", + eo=>"http://eo.wikipedia.org Esperanto [1.1,AL]", + es=>"http://es.wikipedia.org Spanish [500,EU,NA,SA,AS,AF]", + et=>"http://et.wikipedia.org Estonian [1.25,EU]", + eu=>"http://eu.wikipedia.org Basque [1.06,EU]", + ext=>"http://ext.wikipedia.org Extremaduran [0.5,EU]", + fa=>"http://fa.wikipedia.org Persian [107,AS]", + ff=>"http://ff.wikipedia.org Fulfulde [13,AF]", + fi=>"http://fi.wikipedia.org Finnish [6,EU]", + "fiu-vro"=>"http://fiu-vro.wikipedia.org Voro [0.07,EU]", + fiu_vro=>"http://fiu-vro.wikipedia.org Voro [0.07,EU]", + fj=>"http://fj.wikipedia.org Fijian [0.55,OC]", + fo=>"http://fo.wikipedia.org Faroese [0.07,EU]", # was Faeroese + fr=>"http://fr.wikipedia.org French [200,EU,NA,AF,OC]", + frp=>"http://frp.wikipedia.org Arpitan 
[0.113,EU]", + frr=>"http://frr.wikipedia.org North Frisian [0.01,EU]", + fur=>"http://fur.wikipedia.org Friulian [0.794,EU]", + fy=>"http://fy.wikipedia.org Frisian [0.65,EU]", + ga=>"http://ga.wikipedia.org Irish [0.53,EU]", + gan=>"http://gan.wikipedia.org Gan [35,AS,C]", + gay=>"http://gay.wikipedia.org Gayo", + gd=>"http://gdi.wikipedia.org Scots Gaelic [0.07,EU]", # was Scottish Gaelic + gl=>"http://gl.wikipedia.org Galician [3.5,EU]", # was Galego + glk=>"http://glk.wikipedia.org Gilaki [3.3,AS]", + gn=>"http://gn.wikipedia.org Guarani [7,SA]", + got=>"http://got.wikipedia.org Gothic [,EU]", + gu=>"http://gu.wikipedia.org Gujarati [46,AS,I]", + gv=>"http://gv.wikipedia.org Manx [0.0017,EU]", # was Manx Gaelic + ha=>"http://ha.wikipedia.org Hausa [39,AF]", + hak=>"http://hak.wikipedia.org Hakka [34,AS,C]", + haw=>"http://haw.wikipedia.org Hawai'ian [0.027,OC]", # was Hawaiian + he=>"http://he.wikipedia.org Hebrew [10,AS]", + hi=>"http://hi.wikipedia.org Hindi [550,AS]", + hif=>"http://hif.wikipedia.org Fiji Hindi [0.46,OC]", + ho=>"http://ho.wikipedia.org Hiri Motu", + hr=>"http://hr.wikipedia.org Croatian [6.2,EU]", + hsb=>"http://hsb.wikipedia.org Upper Sorbian [0.04,EU]", + ht=>"http://ht.wikipedia.org Haitian [12,NA]", + hu=>"http://hu.wikipedia.org Hungarian [15,EU]", + hy=>"http://hy.wikipedia.org Armenian [5.5,AS]", + hz=>"http://hz.wikipedia.org Herero [0.13,AF]", + ia=>"http://ia.wikipedia.org Interlingua [,AL]", + iba=>"http://iba.wikipedia.org Iban", + id=>"http://id.wikipedia.org Indonesian [250,AS]", + ie=>"http://ie.wikipedia.org Interlingue [,AL]", + ig=>"http://ig.wikipedia.org Igbo [22.5,AF]", + ii=>"http://ii.wikipedia.org Yi [2,AS,C]", + ik=>"http://ik.wikipedia.org Inupiak [0.0021,NA]", + ilo=>"http://ilo.wikipedia.org Ilokano [10,AS]", + io=>"http://io.wikipedia.org Ido [,AL]", + is=>"http://is.wikipedia.org Icelandic [0.32,EU]", + it=>"http://it.wikipedia.org Italian [70,EU]", + iu=>"http://iu.wikipedia.org Inuktitut [0.03,NA]", + 
ja=>"http://ja.wikipedia.org Japanese [132,AS]", + jbo=>"http://jbo.wikipedia.org Lojban [,AL]", + jv=>"http://jv.wikipedia.org Javanese [80,AS]", + ka=>"http://ka.wikipedia.org Georgian [4.2,EU]", + kaa=>"http://kaa.wikipedia.org Karakalpak [0.41,AS]", + kab=>"http://ka.wikipedia.org Kabyle [8,AF]", + kaw=>"http://kaw.wikipedia.org Kawi", + kg=>"http://kg.wikipedia.org Kongo [7,AF]", + ki=>"http://ki.wikipedia.org Kikuyu [5.4,AF]", + kj=>"http://kj.wikipedia.org Kuanyama", + kk=>"http://kk.wikipedia.org Kazakh [12,AS]", + kl=>"http://kl.wikipedia.org Greenlandic [0.05,NA]", + km=>"http://km.wikipedia.org Khmer [18.5,AS]", # was Cambodian + kn=>"http://kn.wikipedia.org Kannada [47,AS,I]", + ko=>"http://ko.wikipedia.org Korean [78,AS]", + koi=>"http://koi.wikipedia.org Komi-Permyak [0.094,EU]", + kr=>"http://kr.wikipedia.org Kanuri [4,AF]", + ks=>"http://ks.wikipedia.org Kashmiri [4.6,AS,I]", + ksh=>"http://ksh.wikipedia.org Ripuarian [0.25,EU]", + ku=>"http://ku.wikipedia.org Kurdish [26,AS]", + kv=>"http://kv.wikipedia.org Komi [0.293,EU]", + kw=>"http://kw.wikipedia.org Cornish [0.000245,EU]", # was Kornish + ky=>"http://ky.wikipedia.org Kirghiz [5,AS]", + la=>"http://la.wikipedia.org Latin [,W]", + lad=>"http://lad.wikipedia.org Ladino [0.109,AS]", + lb=>"http://lb.wikipedia.org Luxembourgish [0.39,EU]", # was Letzeburgesch + lbe=>"http://lbe.wikipedia.org Lak [0.12,AS]", + lg=>"http://lg.wikipedia.org Ganda [10,AF]", + li=>"http://li.wikipedia.org Limburgish [1.6,EU]", + lij=>"http://lij.wikipedia.org Ligurian [1.9,EU]", + lmo=>"http://lmo.wikipedia.org Lombard [3,EU]", + ln=>"http://ln.wikipedia.org Lingala [25,AF]", + lo=>"http://lo.wikipedia.org Laotian [5.2,AS]", + ls=>"http://ls.wikipedia.org Latino Sine Flexione", + lt=>"http://lt.wikipedia.org Lithuanian [3.5,EU]", + lv=>"http://lv.wikipedia.org Latvian [1.6,EU]", + mad=>"http://mad.wikipedia.org Madurese [14]", + mak=>"http://mak.wikipedia.org Makasar [2]", + map_bms=>"http://map-bms.wikipedia.org 
Banyumasan [13.5,AS]", + "map-bms"=>"http://map-bms.wikipedia.org Banyumasan", + mdf=>"http://mdf.wikipedia.org Moksha [0.5,EU]", + mg=>"http://mg.wikipedia.org Malagasy [20,AF]", + mh=>"http://mh.wikipedia.org Marshallese [0.0439,OC]", + mhr=>"http://mhr.wikipedia.org Eastern Mari [0.3,EU]", + mi=>"http://mi.wikipedia.org Maori [0.157,OC]", + min=>"http://min.wikipedia.org Minangkabau [6.5]", + minnan=>"http://minnan.wikipedia.org Minnan", + mk=>"http://mk.wikipedia.org Macedonian [2.7,EU]", + ml=>"http://ml.wikipedia.org Malayalam [37,AS,I]", + mn=>"http://mn.wikipedia.org Mongolian [5.2,AS]", + mo=>"http://mo.wikipedia.org Moldavian [,EU]", + mr=>"http://mr.wikipedia.org Marathi [90,AS,I]", + mrj=>"http://mrj.wikipedia.org Western Mari [0.3,A]", + ms=>"http://ms.wikipedia.org Malay [300,AS]", + mt=>"http://mt.wikipedia.org Maltese [0.37,EU]", + mus=>"http://mus.wikipedia.org Muskogee [0.006,NA]", + mwl=>"http://mwl.wikipedia.org Mirandese [0.015,EU]", + my=>"http://my.wikipedia.org Burmese [52,AS]", + myv=>"http://myv.wikipedia.org Erzya [0.5,AS]", + mzn=>"http://mzn.wikipedia.org Mazandarani [3.7,AS]", + na=>"http://na.wikipedia.org Nauruan [0.006,OC]", # was Nauru + nah=>"http://nah.wikipedia.org Nahuatl [1.45,NA]", + nap=>"http://nap.wikipedia.org Neapolitan [7.5,EU]", + nds=>"http://nds.wikipedia.org Low Saxon [10,EU]", + nds_nl=>"http://nds-nl.wikipedia.org Dutch Low Saxon [10,EU]", + "nds-nl"=>"http://nds-nl.wikipedia.org Dutch Low Saxon [10,EU]", + ne=>"http://ne.wikipedia.org Nepali [30,AS,I]", + new=>"http://new.wikipedia.org Nepal Bhasa [0.8,AS,I]", + ng=>"http://ng.wikipedia.org Ndonga [0.690,AF]", + nl=>"http://nl.wikipedia.org Dutch [27,EU,SA]", + nov=>"http://nov.wikipedia.org Novial [,AL]", + nrm=>"http://nrm.wikipedia.org Norman [,EU]", + nn=>"http://nn.wikipedia.org Nynorsk [4.7,EU]", # was Neo-Norwegian + no=>"http://no.wikipedia.org Norwegian [4.7,EU]", + nv=>"http://nv.wikipedia.org Navajo [0.178,NA]", + ny=>"http://ny.wikipedia.org Chichewa 
[9.3,AF]", + oc=>"http://oc.wikipedia.org Occitan [1.9,EU]", + om=>"http://om.wikipedia.org Oromo [25.5,AF]", + or=>"http://or.wikipedia.org Oriya [31,AS,I]", + os=>"http://os.wikipedia.org Ossetic [0.52,AS]", + pa=>"http://pa.wikipedia.org Punjabi [104,AS,I]", + pag=>"http://pag.wikipedia.org Pangasinan [1.5,AS]", + pam=>"http://pam.wikipedia.org Kapampangan [2.9,AS]", + pap=>"http://pap.wikipedia.org Papiamentu [0.329,SA]", + pcd=>"http://pcd.wikipedia.org Picard [,EU]", + pdc=>"http://pdc.wikipedia.org Pennsylvania German [0.250,NA]", + pi=>"http://pi.wikipedia.org Pali [,AS]", + pih=>"http://pih.wikipedia.org Norfolk [0.0006,OC]", + pl=>"http://pl.wikipedia.org Polish [43,EU]", + pms=>"http://pms.wikipedia.org Piedmontese [2,EU]", + pnb=>"http://pnb.wikipedia.org Western Panjabi [60,AS]", + pnt=>"http://pnt.wikipedia.org Pontic [0.325,EU]", + ps=>"http://ps.wikipedia.org Pashto [26,AS]", + pt=>"http://pt.wikipedia.org Portuguese [290,EU,SA,AF,AS]", + qu=>"http://qu.wikipedia.org Quechua [10.4,SA]", + rue=>"http://rue.wikipedia.org Rusyn [0.6,EU]", + rm=>"http://rm.wikipedia.org Romansh [0.035,EU]", # was Rhaeto-Romance + rmy=>"http://rmy.wikipedia.org Romani [2.5,EU]", + rn=>"http://rn.wikipedia.org Kirundi [4.6,AF]", + ro=>"http://ro.wikipedia.org Romanian [28,EU]", + roa_rup=>"http://roa-rup.wikipedia.org Aromanian [0.3,EU]", + "roa-rup"=>"http://roa-rup.wikipedia.org Aromanian [0.5]", + roa_tara=>"http://roa-tara.wikipedia.org Tarantino [0.9,EU]", + "roa-tara"=>"http://roa-tara.wikipedia.org Tarantino", + ru=>"http://ru.wikipedia.org Russian [278,EU,AS]", + ru_sib=>"http://ru-sib.wikipedia.org Siberian", + "ru-sib"=>"http://ru-sib.wikipedia.org Siberian", + rw=>"http://rw.wikipedia.org Kinyarwanda [12,AF]", + sa=>"http://sa.wikipedia.org Sanskrit [0.05,AS,I]", + sah=>"http://sah.wikipedia.org Sakha [0.456,AS]", + sc=>"http://sc.wikipedia.org Sardinian [1.85,EU]", + scn=>"http://scn.wikipedia.org Sicilian [8,EU]", + sco=>"http://sco.wikipedia.org Scots 
[1.5,EU]", + sd=>"http://sd.wikipedia.org Sindhi [41,AS,I]", + se=>"http://se.wikipedia.org Northern Sami [0.02,EU]", + sg=>"http://sg.wikipedia.org Sangro [3,AF]", + sh=>"http://sh.wikipedia.org Serbo-Croatian [23,EU]", + si=>"http://si.wikipedia.org Sinhala [19,AS]", + simple=>"http://simple.wikipedia.org Simple English [1500,EU,NA,OC,AS,AF]", + sk=>"http://sk.wikipedia.org Slovak [7,EU]", + sl=>"http://sl.wikipedia.org Slovene [2.4,EU]", + sm=>"http://sm.wikipedia.org Samoan [0.370,OC]", + sn=>"http://sn.wikipedia.org Shona [7,AF]", + so=>"http://so.wikipedia.org Somali [13.5,AF]", + sq=>"http://sq.wikipedia.org Albanian [6,EU]", + sr=>"http://sr.wikipedia.org Serbian [12,EU]", + srn=>"http://srn.wikipedia.org Sranan [0.3,SA]", + ss=>"http://ss.wikipedia.org Siswati [3,AF]", + st=>"http://st.wikipedia.org Sesotho [4.9,AF]", + stq=>"http://stq.wikipedia.org Saterland Frisian [0.002,EU]", + su=>"http://su.wikipedia.org Sundanese [27,AS]", + sv=>"http://sv.wikipedia.org Swedish [10,EU]", + sw=>"http://sw.wikipedia.org Swahili [50,AF]", + szl=>"http://szl.wikipedia.org Silesian [0.056,EU]", + ta=>"http://ta.wikipedia.org Tamil [66,AS,I]", + te=>"http://te.wikipedia.org Telugu [80,AS,I]", + test=>"http://test.wikipedia.org Test", + tet=>"http://tet.wikipedia.org Tetum [0.8,AS]", + tg=>"http://tg.wikipedia.org Tajik [4.4,AS]", + th=>"http://th.wikipedia.org Thai [73,AS]", + ti=>"http://ti.wikipedia.org Tigrinya [6.7,AF]", + tk=>"http://tk.wikipedia.org Turkmen [9,AS]", + tl=>"http://tl.wikipedia.org Tagalog [90,AS]", + tlh=>"http://tlh.wikipedia.org Klingon", # was Klignon + tn=>"http://tn.wikipedia.org Setswana [4.4,AF]", + to=>"http://to.wikipedia.org Tongan [0.105,OC]", + tokipona=>"http://tokipona.wikipedia.org Tokipona", + tpi=>"http://tpi.wikipedia.org Tok Pisin [5.5,AS]", + tr=>"http://tr.wikipedia.org Turkish [70,EU,AS]", + ts=>"http://ts.wikipedia.org Tsonga [3.3,AF]", + tt=>"http://tt.wikipedia.org Tatar [8,AS]", + tum=>"http://tum.wikipedia.org Tumbuka 
[2,AF]", + turn=>"http://turn.wikipedia.org Turnbuka", + tw=>"http://tw.wikipedia.org Twi [14.8,AF]", + ty=>"http://ty.wikipedia.org Tahitian [0.120,OC]", + udm=>"http://udm.wikipedia.org Udmurt [0.550,AS]", + ug=>"http://ug.wikipedia.org Uyghur [10,AS,C]", + uk=>"http://uk.wikipedia.org Ukrainian [45,EU]", + ur=>"http://ur.wikipedia.org Urdu [60,AS,I]", + uz=>"http://uz.wikipedia.org Uzbek [23.5,AS]", + ve=>"http://ve.wikipedia.org Venda [0.875,AF]", + vec=>"http://vec.wikipedia.org Venetian [2.3,EU]", + vi=>"http://vi.wikipedia.org Vietnamese [80,AS]", + vls=>"http://vls.wikipedia.org West Flemish [1.06,EU]", + vo=>"http://vo.wikipedia.org Volap\xFCk [0.000010,AL]", + wa=>"http://wa.wikipedia.org Walloon [0.6,EU]", + war=>"http://war.wikipedia.org Waray-Waray [3.1,AS]", + wo=>"http://wo.wikipedia.org Wolof [3.6,AF]", + wuu=>"http://wuu.wikipedia.org Wu [77,AS,C]", + xal=>"http://xal.wikipedia.org Kalmyk [0.174,EU]", + xh=>"http://xh.wikipedia.org Xhosa [7.9,AF]", + yi=>"http://yi.wikipedia.org Yiddish [3.2,W]", + yo=>"http://yo.wikipedia.org Yoruba [25,AF]", + za=>"http://za.wikipedia.org Zhuang [14,AS,C]", + zea=>"http://zea.wikipedia.org Zealandic [0.220,EU]", + zh=>"http://zh.wikipedia.org Chinese [1300,AS]", + zh_min_nan=>"http://zh-min-nan.wikipedia.org Min Nan [49,AS,C]", + "zh-min-nan"=>"http://zh-min-nan.wikipedia.org Min Nan [60]", + zh_classical=>"http://zh-classical.wikipedia.org Classical Chinese [,AS,C]", + "zh-classical"=>"http://zh-classical.wikipedia.org Classical Chinese [,AS,C]", + zh_yue=>"http://zh-yue.wikipedia.org Cantonese [71,AS,C]", + "zh-yue"=>"http://zh-yue.wikipedia.org Cantonese [71,AS,C]", + zu=>"http://zu.wikipedia.org Zulu [26,AF]", + zz=>" All languages", + zzz=>" All languages except English" + ); + + # provide default, may be overruled at localization file + foreach $key (keys %wikipedias) + { + my $wikipedia = $wikipedias {$key} ; + if ($wikipedia =~ /\[.*\]/) + { + $wikipedia2 = $wikipedia ; + $wikipedia2 =~ s/^.*?\[// ; + 
$wikipedia2 =~ s/\].*$// ; + ($speakers, $regions) = split (',', $wikipedia2,2) ; + @regions = split (',', $regions) ; + $out_speakers {$key} = $speakers ; + + if ($speakers > $speakers_max) + { $speakers_max = $speakers ; } + + foreach $region (@regions) + { + if (length ($region) != 2) # land codes China, India + { $region = "" ; } + } + @regions = sort {$a cmp $b} @regions ; + $out_regions {$key} = join (',', @regions) ; + $regions = join (',', @regions) ; + } + $wikipedia =~ s/\s*\[..*$// ; # remove speakers + $out_urls {$key} = $wikipedia ; + $out_languages {$key} = $wikipedia ; + + if (($key !~ /_/) && ($key !~ /(?:nostalgia|sep11|species)/) && ($wikipedia =~ /wikipedia.org/)) # fiu-vro yes, fiu_vro no / also meta, commons etc no + { + ($key2 = $key) =~ s/"//g ; + push @real_languages, $key2 ; + } + + $out_urls {$key} =~ s/(^[^\s]+).*$/$1/ ; + $out_languages {$key} =~ s/^[^\s]+\s+(.*)$/$1/ ; + $out_article {$key} = "http://en.wikipedia.org/wiki/" . $out_languages {$key} . "_language" ; + $out_article {$key} =~ s/ /_/g ; + + $out_urls {$key} =~ s/(^[^\s]+).*$/$1/ ; + } +} + _______________________________________________ MediaWiki-CVS mailing list MediaWiki-CVS [at] lists https://lists.wikimedia.org/mailman/listinfo/mediawiki-cvs
|