Skip to content

Commit 3dddfa6

Browse files
authored
add parser check. (#392)
1 parent 3fd0a9a commit 3dddfa6

10 files changed

+307
-7
lines changed

.github/workflows/commit_check.yml

+1-1
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ on:
77
- '!master'
88

99
jobs:
10-
releases:
10+
commit_check:
1111
name: Check commit
1212
runs-on: macos-latest
1313
timeout-minutes: 30

.github/workflows/parser_check.yml

+26
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
name: Parser Check
2+
3+
on:
4+
# Allow manual builds of this workflow.
5+
workflow_dispatch: { }
6+
push:
7+
branches:
8+
- "master"
9+
# Run this job every hour.
10+
schedule:
11+
- cron: "0 * * * *"
12+
13+
jobs:
14+
parser_check:
15+
runs-on: ubuntu-latest
16+
timeout-minutes: 0.5
17+
permissions:
18+
issues: write
19+
steps:
20+
- uses: actions/checkout@v4
21+
- uses: dart-lang/setup-dart@v1
22+
- name: Verify comment text parser
23+
working-directory: ./scripts/bin
24+
run: |
25+
dart pub get
26+
dart parser_verifier.dart -t ${{ secrets.GITHUB_TOKEN }}

.github/workflows/publish_ios.yml

+1-2
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,8 @@
11
name: Publish (iOS)
22

33
on:
4-
# Allow manual builds of this workflow
4+
# Allow manual builds of this workflow.
55
workflow_dispatch: {}
6-
# Run the workflow whenever a new tag named 'v*' is pushed
76
push:
87
branches:
98
- master

analysis_options.yaml

+1
Original file line numberDiff line numberDiff line change
@@ -11,3 +11,4 @@ linter:
1111
analyzer:
1212
exclude:
1313
- "submodules/**"
14+
- "scripts/**"

pubspec.lock

+3-3
Original file line numberDiff line numberDiff line change
@@ -34,13 +34,13 @@ packages:
3434
source: hosted
3535
version: "2.0.11"
3636
args:
37-
dependency: transitive
37+
dependency: "direct dev"
3838
description:
3939
name: args
40-
sha256: eef6c46b622e0494a36c5a12d10d77fb4e855501a91c1b9ef9339326e58f0596
40+
sha256: "7cf60b9f0cc88203c5a190b4cd62a99feea42759a7fa695010eb5de1c0b2252a"
4141
url: "https://pub.dev"
4242
source: hosted
43-
version: "2.4.2"
43+
version: "2.5.0"
4444
async:
4545
dependency: transitive
4646
description:

pubspec.yaml

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
name: hacki
22
description: A Hacker News reader.
3-
version: 2.7.3+142
3+
version: 2.7.4+143
44
publish_to: none
55

66
environment:

scripts/analysis_options.yaml

+10
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
include: package:very_good_analysis/analysis_options.5.0.0.yaml
2+
linter:
3+
rules:
4+
parameter_assignments: false
5+
public_member_api_docs: false
6+
library_private_types_in_public_api: false
7+
omit_local_variable_types: false
8+
one_member_abstracts: false
9+
always_specify_types: true
10+
avoid_print: false

scripts/bin/parser_verifier.dart

+126
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,126 @@
1+
import 'dart:async';
2+
3+
import 'package:args/args.dart';
4+
import 'package:dio/dio.dart';
5+
import 'package:html/dom.dart' hide Comment;
6+
import 'package:html/parser.dart';
7+
import 'package:html_unescape/html_unescape.dart';
8+
9+
/// Verifies that the comment text parser still yields the expected text for a
/// known Hacker News comment.
///
/// Fetches the thread page of a fixed item id, runs [parseCommentTextHtml] on
/// the first comment element and compares the result with a hard-coded
/// expectation. On a mismatch a GitHub issue is created in `livinglist/hacki`,
/// unless the issue list response already contains the issue title.
///
/// [arguments] must carry the GitHub token, either as `-t <token>` /
/// `--github-token <token>` or as the first positional argument.
///
/// Throws an [ArgumentError] when no token is supplied and an [Exception]
/// when the fetched page contains no comment element.
Future<void> main(List<String> arguments) async {
  // Declared as an option (not a flag) so that `-t <token>` binds the value
  // to the option instead of leaving it among the positional arguments.
  final ArgParser parser = ArgParser()..addOption('github-token', abbr: 't');
  final ArgResults argResults = parser.parse(arguments);
  // Fall back to the first positional argument so that a bare token keeps
  // working.
  final String? token = argResults.option('github-token') ??
      (argResults.rest.isEmpty ? null : argResults.rest.first);
  if (token == null || token.isEmpty) {
    throw ArgumentError('Missing GitHub token, pass it with -t <token>.');
  }

  // The expected parser result.
  const String text = '''
What does it say about the world we live in where blogs do more basic journalism than CNN? All that one would have had to do is read the report actually provided.

I don't think I'm being too extreme when I say that, apart from maybe PBS, there is no reputable source of news in America. If you don't believe me, pick a random story, watch it as it gets rewritten a million times through Reuters, then check back on the facts of the story one year later. A news story gets twisted to promote some narrative that will sell papers, and when the facts of the story are finally verified (usually not by the news themselves, but lawyers or courts or whoever), the story is dropped and never reported on again.

Again, if the only thing a reporter had to do was read the report to find the facts of the case to verify what is and isn't true, what the fuck is even the point of a news agency?''';

  const String itemBaseUrl = 'https://news.ycombinator.com/item?id=';
  const Map<String, String> headers = <String, String>{
    'accept': '*/*',
    'user-agent':
        'Mozilla/5.0 (iPhone; CPU iPhone OS 17_1_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.1 Mobile/15E148 Safari/604.1',
  };
  const int itemId = 11536543;
  const String athingComtrSelector =
      '#hnmain > tbody > tr > td > table > tbody > .athing.comtr';
  const String commentTextSelector =
      '''td > table > tbody > tr > td.default > div.comment > div.commtext''';

  final Dio dio = Dio();
  try {
    // Get HTML of the thread.
    final Uri url = Uri.parse('$itemBaseUrl$itemId');
    final Options option =
        Options(headers: headers, persistentConnection: true);
    final Response<String> response =
        await dio.getUri<String>(url, options: option);

    // Parse the HTML and select all the comment elements.
    final Document document = parse(response.data ?? '');
    final List<Element> elements =
        document.querySelectorAll(athingComtrSelector);
    if (elements.isEmpty) {
      throw Exception('No comment from Hacker News.');
    }

    // Verify comment text parser using the first comment element.
    final Element? cmtTextElement =
        elements.first.querySelector(commentTextSelector);
    final String parsedText =
        await parseCommentTextHtml(cmtTextElement?.innerHtml ?? '');

    if (parsedText != text) {
      await _reportParserFailure(
        dio: dio,
        token: token,
        expected: text,
        actual: parsedText,
      );
    }
  } finally {
    // Release the client explicitly; NOTE(review): idle keep-alive
    // connections may otherwise delay process exit - confirm.
    dio.close(force: true);
  }
}

/// Creates a GitHub issue showing [expected] next to [actual], unless the
/// issue list response already contains the issue title.
Future<void> _reportParserFailure({
  required Dio dio,
  required String token,
  required String expected,
  required String actual,
}) async {
  final Uri issuesUrl =
      Uri.parse('https://api.github.com/repos/livinglist/hacki/issues');
  const String issueTitle = 'Parser check failed.';
  final Map<String, String> githubHeaders = <String, String>{
    'Authorization': 'Bearer $token',
    'X-GitHub-Api-Version': '2022-11-28',
    'Content-Type': 'application/json',
  };

  // Check if an issue with same title already exists. The request is
  // authenticated as well; NOTE(review): meant to avoid the anonymous API
  // rate limit - confirm against the GitHub REST documentation.
  final Response<String> existingIssues = await dio.getUri<String>(
    issuesUrl,
    options: Options(headers: githubHeaders),
  );
  if (existingIssues.data?.contains(issueTitle) ?? false) {
    print('Issue already exists.');
    return;
  }

  // Create the issue if one does not exist.
  final Map<String, dynamic> githubIssuePayload = <String, dynamic>{
    'title': issueTitle,
    'body': '''
| Expected | Actual |
| ------------- | ------------- |
| ${expected.replaceAll('\n', '<br>')} | ${actual.replaceAll('\n', '<br>')} |''',
  };
  await dio.postUri<String>(
    issuesUrl,
    data: githubIssuePayload,
    options: Options(headers: githubHeaders),
  );
}
92+
93+
/// Converts the inner HTML of a comment into plain text.
///
/// HTML entities in [text] are unescaped first. Then, in order: reply `<div>`
/// blocks are removed, `<span>` wrappers are replaced by their content, every
/// `<p>` becomes its content preceded by a blank line, anchors are replaced
/// by their `href` value and `<i>` content is wrapped in asterisks. The
/// result is trimmed before it is returned.
Future<String> parseCommentTextHtml(String text) async {
  final RegExp replyDivPattern = RegExp(
    r'\<div class="reply"\>(.*?)\<\/div\>',
    dotAll: true,
  );
  final RegExp spanPattern = RegExp(
    r'\<span class="(.*?)"\>(.*?)\<\/span\>',
    dotAll: true,
  );
  final RegExp paragraphPattern = RegExp(
    r'\<p\>(.*?)\<\/p\>',
    dotAll: true,
  );
  final RegExp anchorPattern = RegExp(r'\<a href=\"(.*?)\".*?\>.*?\<\/a\>');
  final RegExp italicPattern = RegExp(r'\<i\>(.*?)\<\/i\>');

  final String unescaped = HtmlUnescape().convert(text);

  // Drop the reply block entirely.
  final String withoutReply = unescaped.replaceAll(replyDivPattern, '');

  // Keep only what is inside a span.
  final String withoutSpans = withoutReply.replaceAllMapped(
    spanPattern,
    (Match m) => '${m[2]}',
  );

  // Separate paragraphs with a blank line.
  final String withParagraphs = withoutSpans.replaceAllMapped(
    paragraphPattern,
    (Match m) => '\n\n${m[1]}',
  );

  // Show links as their target address.
  final String withLinks = withParagraphs.replaceAllMapped(
    anchorPattern,
    (Match m) => m[1] ?? '',
  );

  // Mark italic text with surrounding asterisks.
  final String withItalics = withLinks.replaceAllMapped(
    italicPattern,
    (Match m) => '*${m[1]}*',
  );

  return withItalics.trim();
}

scripts/pubspec.lock

+125
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,125 @@
1+
# Generated by pub
2+
# See https://dart.dev/tools/pub/glossary#lockfile
3+
packages:
4+
args:
5+
dependency: "direct main"
6+
description:
7+
name: args
8+
sha256: "7cf60b9f0cc88203c5a190b4cd62a99feea42759a7fa695010eb5de1c0b2252a"
9+
url: "https://pub.dev"
10+
source: hosted
11+
version: "2.5.0"
12+
async:
13+
dependency: transitive
14+
description:
15+
name: async
16+
sha256: "947bfcf187f74dbc5e146c9eb9c0f10c9f8b30743e341481c1e2ed3ecc18c20c"
17+
url: "https://pub.dev"
18+
source: hosted
19+
version: "2.11.0"
20+
collection:
21+
dependency: transitive
22+
description:
23+
name: collection
24+
sha256: ee67cb0715911d28db6bf4af1026078bd6f0128b07a5f66fb2ed94ec6783c09a
25+
url: "https://pub.dev"
26+
source: hosted
27+
version: "1.18.0"
28+
csslib:
29+
dependency: transitive
30+
description:
31+
name: csslib
32+
sha256: "706b5707578e0c1b4b7550f64078f0a0f19dec3f50a178ffae7006b0a9ca58fb"
33+
url: "https://pub.dev"
34+
source: hosted
35+
version: "1.0.0"
36+
dio:
37+
dependency: "direct main"
38+
description:
39+
name: dio
40+
sha256: "11e40df547d418cc0c4900a9318b26304e665da6fa4755399a9ff9efd09034b5"
41+
url: "https://pub.dev"
42+
source: hosted
43+
version: "5.4.3+1"
44+
html:
45+
dependency: "direct main"
46+
description:
47+
name: html
48+
sha256: "3a7812d5bcd2894edf53dfaf8cd640876cf6cef50a8f238745c8b8120ea74d3a"
49+
url: "https://pub.dev"
50+
source: hosted
51+
version: "0.15.4"
52+
html_unescape:
53+
dependency: "direct main"
54+
description:
55+
name: html_unescape
56+
sha256: "15362d7a18f19d7b742ef8dcb811f5fd2a2df98db9f80ea393c075189e0b61e3"
57+
url: "https://pub.dev"
58+
source: hosted
59+
version: "2.0.0"
60+
http_parser:
61+
dependency: transitive
62+
description:
63+
name: http_parser
64+
sha256: "2aa08ce0341cc9b354a498388e30986515406668dbcc4f7c950c3e715496693b"
65+
url: "https://pub.dev"
66+
source: hosted
67+
version: "4.0.2"
68+
meta:
69+
dependency: transitive
70+
description:
71+
name: meta
72+
sha256: bdb68674043280c3428e9ec998512fb681678676b3c54e773629ffe74419f8c7
73+
url: "https://pub.dev"
74+
source: hosted
75+
version: "1.15.0"
76+
path:
77+
dependency: transitive
78+
description:
79+
name: path
80+
sha256: "087ce49c3f0dc39180befefc60fdb4acd8f8620e5682fe2476afd0b3688bb4af"
81+
url: "https://pub.dev"
82+
source: hosted
83+
version: "1.9.0"
84+
source_span:
85+
dependency: transitive
86+
description:
87+
name: source_span
88+
sha256: "53e943d4206a5e30df338fd4c6e7a077e02254531b138a15aec3bd143c1a8b3c"
89+
url: "https://pub.dev"
90+
source: hosted
91+
version: "1.10.0"
92+
string_scanner:
93+
dependency: transitive
94+
description:
95+
name: string_scanner
96+
sha256: "556692adab6cfa87322a115640c11f13cb77b3f076ddcc5d6ae3c20242bedcde"
97+
url: "https://pub.dev"
98+
source: hosted
99+
version: "1.2.0"
100+
term_glyph:
101+
dependency: transitive
102+
description:
103+
name: term_glyph
104+
sha256: a29248a84fbb7c79282b40b8c72a1209db169a2e0542bce341da992fe1bc7e84
105+
url: "https://pub.dev"
106+
source: hosted
107+
version: "1.2.1"
108+
typed_data:
109+
dependency: transitive
110+
description:
111+
name: typed_data
112+
sha256: facc8d6582f16042dd49f2463ff1bd6e2c9ef9f3d5da3d9b087e244a7b564b3c
113+
url: "https://pub.dev"
114+
source: hosted
115+
version: "1.3.2"
116+
very_good_analysis:
117+
dependency: "direct dev"
118+
description:
119+
name: very_good_analysis
120+
sha256: "9ae7f3a3bd5764fb021b335ca28a34f040cd0ab6eec00a1b213b445dae58a4b8"
121+
url: "https://pub.dev"
122+
source: hosted
123+
version: "5.1.0"
124+
sdks:
125+
dart: ">=3.0.0 <4.0.0"

scripts/pubspec.yaml

+13
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
name: parser_verifier
2+
3+
environment:
4+
sdk: ">=3.0.0 <4.0.0"
5+
6+
dependencies:
7+
args: ^2.5.0
8+
dio: ^5.0.3
9+
html: ^0.15.1
10+
html_unescape: ^2.0.0
11+
12+
dev_dependencies:
13+
very_good_analysis: ^5.0.0

0 commit comments

Comments
 (0)